From 932c58a4978a1b260e9ca758d716c3b34dbc92a8 Mon Sep 17 00:00:00 2001 From: James DeFelice Date: Wed, 10 Jun 2015 20:58:22 +0000 Subject: [PATCH] Kubernetes Mesos integration This commit includes the fundamental components of the Kubernetes Mesos integration: * Kubernetes-Mesos scheduler * Kubernetes-Mesos executor * Supporting libs Dependencies and upstream changes are included in a separate commit for easy review. After this initial upstream, there'll be two PRs following. * km (hypercube) and k8sm-controller-manager #9265 * Static pods support #9077 Fixes applied: - Precise metrics subsystems definitions - mesosphere/kubernetes-mesos#331 - https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion_r31875232 - https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion_r31875240 - Improve comments and add clarifications - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875208 - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875226 - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875227 - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875228 - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875239 - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875243 - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875234 - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875256 - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875255 - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875251 - Clarify which Schedule function is actually called - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875246 --- contrib/mesos/cmd/k8sm-executor/doc.go | 18 + contrib/mesos/cmd/k8sm-executor/main.go | 47 + contrib/mesos/cmd/k8sm-redirfd/doc.go | 21 + contrib/mesos/cmd/k8sm-redirfd/redirfd.go | 105 ++ contrib/mesos/cmd/k8sm-scheduler/doc.go | 18 + contrib/mesos/cmd/k8sm-scheduler/main.go | 46 + contrib/mesos/pkg/assert/assert.go | 43 + contrib/mesos/pkg/assert/doc.go | 19 + contrib/mesos/pkg/backoff/backoff.go | 96 ++ contrib/mesos/pkg/backoff/doc.go | 19 + contrib/mesos/pkg/election/doc.go | 18 + contrib/mesos/pkg/election/etcd_master.go | 185 ++++ .../mesos/pkg/election/etcd_master_test.go | 98 ++ contrib/mesos/pkg/election/fake.go | 53 + contrib/mesos/pkg/election/master.go | 134 +++ contrib/mesos/pkg/election/master_test.go | 98 ++ contrib/mesos/pkg/executor/config/config.go | 29 + contrib/mesos/pkg/executor/config/doc.go | 18 + contrib/mesos/pkg/executor/doc.go | 21 + contrib/mesos/pkg/executor/executor.go | 846 ++++++++++++++++ contrib/mesos/pkg/executor/executor_test.go | 618 ++++++++++++ contrib/mesos/pkg/executor/messages/doc.go | 18 + .../mesos/pkg/executor/messages/messages.go | 32 + contrib/mesos/pkg/executor/mock_test.go | 81 ++ contrib/mesos/pkg/executor/service/doc.go | 18 + contrib/mesos/pkg/executor/service/service.go | 600 ++++++++++++ contrib/mesos/pkg/hyperkube/doc.go | 21 + contrib/mesos/pkg/hyperkube/types.go | 54 + contrib/mesos/pkg/offers/doc.go | 18 + contrib/mesos/pkg/offers/metrics/doc.go | 19 + contrib/mesos/pkg/offers/metrics/metrics.go | 89 ++ contrib/mesos/pkg/offers/offers.go | 570 +++++++++++ contrib/mesos/pkg/offers/offers_test.go | 391 ++++++++ 
contrib/mesos/pkg/proc/doc.go | 19 + contrib/mesos/pkg/proc/errors.go | 34 + contrib/mesos/pkg/proc/proc.go | 377 +++++++ contrib/mesos/pkg/proc/proc_test.go | 373 +++++++ contrib/mesos/pkg/proc/state.go | 55 ++ contrib/mesos/pkg/proc/types.go | 71 ++ contrib/mesos/pkg/profile/doc.go | 18 + contrib/mesos/pkg/profile/profile.go | 27 + contrib/mesos/pkg/queue/delay.go | 373 +++++++ contrib/mesos/pkg/queue/delay_test.go | 406 ++++++++ contrib/mesos/pkg/queue/doc.go | 19 + contrib/mesos/pkg/queue/historical.go | 403 ++++++++ contrib/mesos/pkg/queue/historical_test.go | 191 ++++ contrib/mesos/pkg/queue/interface.go | 103 ++ contrib/mesos/pkg/queue/policy.go | 70 ++ contrib/mesos/pkg/queue/priority.go | 56 ++ contrib/mesos/pkg/redirfd/doc.go | 19 + contrib/mesos/pkg/redirfd/file_descriptor.go | 41 + .../mesos/pkg/redirfd/file_descriptor_test.go | 54 + contrib/mesos/pkg/redirfd/redirfd_unix.go | 208 ++++ contrib/mesos/pkg/redirfd/redirfd_windows.go | 39 + contrib/mesos/pkg/runtime/doc.go | 19 + contrib/mesos/pkg/runtime/latch.go | 35 + contrib/mesos/pkg/runtime/latch_test.go | 61 ++ contrib/mesos/pkg/runtime/metrics.go | 47 + contrib/mesos/pkg/runtime/util.go | 122 +++ contrib/mesos/pkg/runtime/util_test.go | 64 ++ contrib/mesos/pkg/scheduler/config/config.go | 109 +++ .../mesos/pkg/scheduler/config/config_test.go | 112 +++ contrib/mesos/pkg/scheduler/config/doc.go | 18 + .../pkg/scheduler/constraint/constraint.go | 106 ++ .../scheduler/constraint/constraint_test.go | 79 ++ contrib/mesos/pkg/scheduler/constraint/doc.go | 19 + contrib/mesos/pkg/scheduler/doc.go | 18 + contrib/mesos/pkg/scheduler/fcfs.go | 57 ++ contrib/mesos/pkg/scheduler/ha/doc.go | 18 + contrib/mesos/pkg/scheduler/ha/election.go | 73 ++ contrib/mesos/pkg/scheduler/ha/ha.go | 285 ++++++ .../mesos/pkg/scheduler/meta/annotations.go | 30 + contrib/mesos/pkg/scheduler/meta/doc.go | 19 + contrib/mesos/pkg/scheduler/meta/store.go | 24 + contrib/mesos/pkg/scheduler/metrics/doc.go | 18 + .../mesos/pkg/scheduler/metrics/metrics.go | 102 ++ contrib/mesos/pkg/scheduler/mock_test.go | 203 ++++ contrib/mesos/pkg/scheduler/plugin.go | 875 +++++++++++++++++ contrib/mesos/pkg/scheduler/plugin_test.go | 700 +++++++++++++ contrib/mesos/pkg/scheduler/pod.go | 80 ++ contrib/mesos/pkg/scheduler/podtask/debug.go | 54 + contrib/mesos/pkg/scheduler/podtask/doc.go | 18 + contrib/mesos/pkg/scheduler/podtask/leaky.go | 29 + .../mesos/pkg/scheduler/podtask/pod_task.go | 373 +++++++ .../pkg/scheduler/podtask/pod_task_test.go | 153 +++ .../pkg/scheduler/podtask/port_mapping.go | 185 ++++ .../scheduler/podtask/port_mapping_test.go | 205 ++++ .../mesos/pkg/scheduler/podtask/protobuf.go | 57 ++ .../mesos/pkg/scheduler/podtask/registry.go | 335 +++++++ .../pkg/scheduler/podtask/registry_test.go | 320 ++++++ contrib/mesos/pkg/scheduler/scheduler.go | 924 ++++++++++++++++++ contrib/mesos/pkg/scheduler/scheduler_test.go | 350 +++++++ .../pkg/scheduler/service/compat_testing.go | 32 + .../pkg/scheduler/service/compat_unix.go | 38 + .../pkg/scheduler/service/compat_windows.go | 51 + contrib/mesos/pkg/scheduler/service/doc.go | 18 + .../mesos/pkg/scheduler/service/publish.go | 121 +++ .../mesos/pkg/scheduler/service/service.go | 751 ++++++++++++++ .../pkg/scheduler/service/service_test.go | 108 ++ contrib/mesos/pkg/scheduler/service/util.go | 88 ++ contrib/mesos/pkg/scheduler/types.go | 49 + contrib/mesos/pkg/scheduler/uid/doc.go | 18 + contrib/mesos/pkg/scheduler/uid/uid.go | 85 ++ contrib/mesos/pkg/scheduler/uid/uid_test.go | 47 + contrib/mesos/target.sh | 43 + 105 
files changed, 15162 insertions(+) create mode 100644 contrib/mesos/cmd/k8sm-executor/doc.go create mode 100644 contrib/mesos/cmd/k8sm-executor/main.go create mode 100644 contrib/mesos/cmd/k8sm-redirfd/doc.go create mode 100644 contrib/mesos/cmd/k8sm-redirfd/redirfd.go create mode 100644 contrib/mesos/cmd/k8sm-scheduler/doc.go create mode 100644 contrib/mesos/cmd/k8sm-scheduler/main.go create mode 100644 contrib/mesos/pkg/assert/assert.go create mode 100644 contrib/mesos/pkg/assert/doc.go create mode 100644 contrib/mesos/pkg/backoff/backoff.go create mode 100644 contrib/mesos/pkg/backoff/doc.go create mode 100644 contrib/mesos/pkg/election/doc.go create mode 100644 contrib/mesos/pkg/election/etcd_master.go create mode 100644 contrib/mesos/pkg/election/etcd_master_test.go create mode 100644 contrib/mesos/pkg/election/fake.go create mode 100644 contrib/mesos/pkg/election/master.go create mode 100644 contrib/mesos/pkg/election/master_test.go create mode 100644 contrib/mesos/pkg/executor/config/config.go create mode 100644 contrib/mesos/pkg/executor/config/doc.go create mode 100644 contrib/mesos/pkg/executor/doc.go create mode 100644 contrib/mesos/pkg/executor/executor.go create mode 100644 contrib/mesos/pkg/executor/executor_test.go create mode 100644 contrib/mesos/pkg/executor/messages/doc.go create mode 100644 contrib/mesos/pkg/executor/messages/messages.go create mode 100644 contrib/mesos/pkg/executor/mock_test.go create mode 100644 contrib/mesos/pkg/executor/service/doc.go create mode 100644 contrib/mesos/pkg/executor/service/service.go create mode 100644 contrib/mesos/pkg/hyperkube/doc.go create mode 100644 contrib/mesos/pkg/hyperkube/types.go create mode 100644 contrib/mesos/pkg/offers/doc.go create mode 100644 contrib/mesos/pkg/offers/metrics/doc.go create mode 100644 contrib/mesos/pkg/offers/metrics/metrics.go create mode 100644 contrib/mesos/pkg/offers/offers.go create mode 100644 contrib/mesos/pkg/offers/offers_test.go create mode 100644 contrib/mesos/pkg/proc/doc.go create mode 100644 contrib/mesos/pkg/proc/errors.go create mode 100644 contrib/mesos/pkg/proc/proc.go create mode 100644 contrib/mesos/pkg/proc/proc_test.go create mode 100644 contrib/mesos/pkg/proc/state.go create mode 100644 contrib/mesos/pkg/proc/types.go create mode 100644 contrib/mesos/pkg/profile/doc.go create mode 100644 contrib/mesos/pkg/profile/profile.go create mode 100644 contrib/mesos/pkg/queue/delay.go create mode 100644 contrib/mesos/pkg/queue/delay_test.go create mode 100644 contrib/mesos/pkg/queue/doc.go create mode 100644 contrib/mesos/pkg/queue/historical.go create mode 100644 contrib/mesos/pkg/queue/historical_test.go create mode 100644 contrib/mesos/pkg/queue/interface.go create mode 100644 contrib/mesos/pkg/queue/policy.go create mode 100644 contrib/mesos/pkg/queue/priority.go create mode 100644 contrib/mesos/pkg/redirfd/doc.go create mode 100644 contrib/mesos/pkg/redirfd/file_descriptor.go create mode 100644 contrib/mesos/pkg/redirfd/file_descriptor_test.go create mode 100644 contrib/mesos/pkg/redirfd/redirfd_unix.go create mode 100644 contrib/mesos/pkg/redirfd/redirfd_windows.go create mode 100644 contrib/mesos/pkg/runtime/doc.go create mode 100644 contrib/mesos/pkg/runtime/latch.go create mode 100644 contrib/mesos/pkg/runtime/latch_test.go create mode 100644 contrib/mesos/pkg/runtime/metrics.go create mode 100644 contrib/mesos/pkg/runtime/util.go create mode 100644 contrib/mesos/pkg/runtime/util_test.go create mode 100644 contrib/mesos/pkg/scheduler/config/config.go create mode 100644 
contrib/mesos/pkg/scheduler/config/config_test.go create mode 100644 contrib/mesos/pkg/scheduler/config/doc.go create mode 100644 contrib/mesos/pkg/scheduler/constraint/constraint.go create mode 100644 contrib/mesos/pkg/scheduler/constraint/constraint_test.go create mode 100644 contrib/mesos/pkg/scheduler/constraint/doc.go create mode 100644 contrib/mesos/pkg/scheduler/doc.go create mode 100644 contrib/mesos/pkg/scheduler/fcfs.go create mode 100644 contrib/mesos/pkg/scheduler/ha/doc.go create mode 100644 contrib/mesos/pkg/scheduler/ha/election.go create mode 100644 contrib/mesos/pkg/scheduler/ha/ha.go create mode 100644 contrib/mesos/pkg/scheduler/meta/annotations.go create mode 100644 contrib/mesos/pkg/scheduler/meta/doc.go create mode 100644 contrib/mesos/pkg/scheduler/meta/store.go create mode 100644 contrib/mesos/pkg/scheduler/metrics/doc.go create mode 100644 contrib/mesos/pkg/scheduler/metrics/metrics.go create mode 100644 contrib/mesos/pkg/scheduler/mock_test.go create mode 100644 contrib/mesos/pkg/scheduler/plugin.go create mode 100644 contrib/mesos/pkg/scheduler/plugin_test.go create mode 100644 contrib/mesos/pkg/scheduler/pod.go create mode 100644 contrib/mesos/pkg/scheduler/podtask/debug.go create mode 100644 contrib/mesos/pkg/scheduler/podtask/doc.go create mode 100644 contrib/mesos/pkg/scheduler/podtask/leaky.go create mode 100644 contrib/mesos/pkg/scheduler/podtask/pod_task.go create mode 100644 contrib/mesos/pkg/scheduler/podtask/pod_task_test.go create mode 100644 contrib/mesos/pkg/scheduler/podtask/port_mapping.go create mode 100644 contrib/mesos/pkg/scheduler/podtask/port_mapping_test.go create mode 100644 contrib/mesos/pkg/scheduler/podtask/protobuf.go create mode 100644 contrib/mesos/pkg/scheduler/podtask/registry.go create mode 100644 contrib/mesos/pkg/scheduler/podtask/registry_test.go create mode 100644 contrib/mesos/pkg/scheduler/scheduler.go create mode 100644 contrib/mesos/pkg/scheduler/scheduler_test.go create mode 100644 contrib/mesos/pkg/scheduler/service/compat_testing.go create mode 100644 contrib/mesos/pkg/scheduler/service/compat_unix.go create mode 100644 contrib/mesos/pkg/scheduler/service/compat_windows.go create mode 100644 contrib/mesos/pkg/scheduler/service/doc.go create mode 100644 contrib/mesos/pkg/scheduler/service/publish.go create mode 100644 contrib/mesos/pkg/scheduler/service/service.go create mode 100644 contrib/mesos/pkg/scheduler/service/service_test.go create mode 100644 contrib/mesos/pkg/scheduler/service/util.go create mode 100644 contrib/mesos/pkg/scheduler/types.go create mode 100644 contrib/mesos/pkg/scheduler/uid/doc.go create mode 100644 contrib/mesos/pkg/scheduler/uid/uid.go create mode 100644 contrib/mesos/pkg/scheduler/uid/uid_test.go create mode 100644 contrib/mesos/target.sh diff --git a/contrib/mesos/cmd/k8sm-executor/doc.go b/contrib/mesos/cmd/k8sm-executor/doc.go new file mode 100644 index 00000000000..2a2041eb6f6 --- /dev/null +++ b/contrib/mesos/cmd/k8sm-executor/doc.go @@ -0,0 +1,18 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package main implements the executable Kubernetes Mesos executor.
+package main
diff --git a/contrib/mesos/cmd/k8sm-executor/main.go b/contrib/mesos/cmd/k8sm-executor/main.go
new file mode 100644
index 00000000000..353f6b448ab
--- /dev/null
+++ b/contrib/mesos/cmd/k8sm-executor/main.go
@@ -0,0 +1,47 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package main
+
+import (
+	"fmt"
+	"os"
+	"runtime"
+
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/service"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/hyperkube"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/version/verflag"
+	"github.com/spf13/pflag"
+)
+
+func main() {
+	runtime.GOMAXPROCS(runtime.NumCPU())
+
+	s := service.NewKubeletExecutorServer()
+	s.AddStandaloneFlags(pflag.CommandLine)
+
+	util.InitFlags()
+	util.InitLogs()
+	defer util.FlushLogs()
+
+	verflag.PrintAndExitIfRequested()
+
+	if err := s.Run(hyperkube.Nil(), pflag.CommandLine.Args()); err != nil {
+		// print the error directly; don't use it as a printf format string
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
+}
diff --git a/contrib/mesos/cmd/k8sm-redirfd/doc.go b/contrib/mesos/cmd/k8sm-redirfd/doc.go
new file mode 100644
index 00000000000..fdfc3b67426
--- /dev/null
+++ b/contrib/mesos/cmd/k8sm-redirfd/doc.go
@@ -0,0 +1,21 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package main is used for testing the redirfd package.
+// Inspired by http://skarnet.org/software/execline/redirfd.html.
+// Usage:
+//	k8sm-redirfd [-n] [-b] {mode} {fd} {file} {prog...}
+package main
diff --git a/contrib/mesos/cmd/k8sm-redirfd/redirfd.go b/contrib/mesos/cmd/k8sm-redirfd/redirfd.go
new file mode 100644
index 00000000000..5592b3273d5
--- /dev/null
+++ b/contrib/mesos/cmd/k8sm-redirfd/redirfd.go
@@ -0,0 +1,105 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package main
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"os/exec"
+	"syscall"
+
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/redirfd"
+)
+
+func main() {
+	nonblock := flag.Bool("n", false, "open file in non-blocking mode")
+	changemode := flag.Bool("b", false, "change mode of file after opening it: to non-blocking mode if the -n option was not given, to blocking mode if it was")
+	flag.Parse()
+
+	args := flag.Args()
+	if len(args) < 4 {
+		fmt.Fprintf(os.Stderr, "expected {mode} {fd} {file} {prog...} instead of: %v\n", args)
+		os.Exit(1)
+	}
+
+	var mode redirfd.RedirectMode
+	switch m := args[0]; m {
+	case "r":
+		mode = redirfd.Read
+	case "w":
+		mode = redirfd.Write
+	case "u":
+		mode = redirfd.Update
+	case "a":
+		mode = redirfd.Append
+	case "c":
+		mode = redirfd.AppendExisting
+	case "x":
+		mode = redirfd.WriteNew
+	default:
+		// report the unrecognized mode string, not the zero-valued mode
+		fmt.Fprintf(os.Stderr, "unrecognized mode %q\n", m)
+		os.Exit(1)
+	}
+
+	fd, err := redirfd.ParseFileDescriptor(args[1])
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "failed to parse file descriptor: %v\n", err)
+		os.Exit(1)
+	}
+	file := args[2]
+
+	f, err := mode.Redirect(*nonblock, *changemode, fd, file)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "redirect failed: %q, %v\n", args[1], err)
+		os.Exit(1)
+	}
+	var pargs []string
+	if len(args) > 4 {
+		pargs = args[4:]
+	}
+	cmd := exec.Command(args[3], pargs...)
+	cmd.Stdin = os.Stdin
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	switch fd {
+	case redirfd.Stdin:
+		cmd.Stdin = f
+	case redirfd.Stdout:
+		cmd.Stdout = f
+	case redirfd.Stderr:
+		cmd.Stderr = f
+	default:
+		cmd.ExtraFiles = []*os.File{f}
+	}
+	defer f.Close()
+	if err = cmd.Run(); err != nil {
+		// guard the type assertion: Run can also fail before the process starts
+		if exiterr, ok := err.(*exec.ExitError); ok {
+			state := exiterr.ProcessState
+			if state != nil {
+				sys := state.Sys()
+				if waitStatus, ok := sys.(syscall.WaitStatus); ok {
+					if waitStatus.Signaled() {
+						os.Exit(256 + int(waitStatus.Signal()))
+					} else {
+						os.Exit(waitStatus.ExitStatus())
+					}
+				}
+			}
+		}
+		os.Exit(3)
+	}
+}
diff --git a/contrib/mesos/cmd/k8sm-scheduler/doc.go b/contrib/mesos/cmd/k8sm-scheduler/doc.go
new file mode 100644
index 00000000000..68e44de0b54
--- /dev/null
+++ b/contrib/mesos/cmd/k8sm-scheduler/doc.go
@@ -0,0 +1,18 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package main implements the executable Kubernetes Mesos scheduler.
+package main
diff --git a/contrib/mesos/cmd/k8sm-scheduler/main.go b/contrib/mesos/cmd/k8sm-scheduler/main.go
new file mode 100644
index 00000000000..8ff4a987bb7
--- /dev/null
+++ b/contrib/mesos/cmd/k8sm-scheduler/main.go
@@ -0,0 +1,46 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package main
+
+import (
+	"fmt"
+	"os"
+	"runtime"
+
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/hyperkube"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/service"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/version/verflag"
+	"github.com/spf13/pflag"
+)
+
+func main() {
+	runtime.GOMAXPROCS(runtime.NumCPU())
+	s := service.NewSchedulerServer()
+	s.AddStandaloneFlags(pflag.CommandLine)
+
+	util.InitFlags()
+	util.InitLogs()
+	defer util.FlushLogs()
+
+	verflag.PrintAndExitIfRequested()
+
+	if err := s.Run(hyperkube.Nil(), pflag.CommandLine.Args()); err != nil {
+		// print the error directly; don't use it as a printf format string
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
+}
diff --git a/contrib/mesos/pkg/assert/assert.go b/contrib/mesos/pkg/assert/assert.go
new file mode 100644
index 00000000000..dd716465392
--- /dev/null
+++ b/contrib/mesos/pkg/assert/assert.go
@@ -0,0 +1,43 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package assert
+
+import (
+	"fmt"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// EventuallyTrue asserts that the given predicate becomes true within the given
+// timeout. It checks the predicate every 100ms.
+func EventuallyTrue(t *testing.T, timeout time.Duration, fn func() bool, msgAndArgs ...interface{}) bool {
+	start := time.Now()
+	for {
+		if fn() {
+			return true
+		}
+		if time.Since(start) > timeout {
+			if len(msgAndArgs) > 0 {
+				return assert.Fail(t, msgAndArgs[0].(string), msgAndArgs[1:]...)
+			}
+			// Fail does not interpret printf verbs, so format the message first.
+			return assert.Fail(t, fmt.Sprintf("predicate fn has not been true after %v", timeout))
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+}
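+
+// A hypothetical test using EventuallyTrue (editor's illustration, not part of
+// the original patch; the flag and goroutine below are invented):
+//
+//	func TestWorkerEventuallyReady(t *testing.T) {
+//		var ready int32
+//		go func() {
+//			time.Sleep(50 * time.Millisecond)
+//			atomic.StoreInt32(&ready, 1)
+//		}()
+//		EventuallyTrue(t, 2*time.Second, func() bool {
+//			return atomic.LoadInt32(&ready) == 1
+//		}, "worker never became ready")
+//	}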
diff --git a/contrib/mesos/pkg/assert/doc.go b/contrib/mesos/pkg/assert/doc.go
new file mode 100644
index 00000000000..3fb556cecc2
--- /dev/null
+++ b/contrib/mesos/pkg/assert/doc.go
@@ -0,0 +1,19 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package assert is a utility package containing reusable testing functionality
+// extending github.com/stretchr/testify/assert
+package assert
diff --git a/contrib/mesos/pkg/backoff/backoff.go b/contrib/mesos/pkg/backoff/backoff.go
new file mode 100644
index 00000000000..f2b12b26027
--- /dev/null
+++ b/contrib/mesos/pkg/backoff/backoff.go
@@ -0,0 +1,96 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package backoff
+
+import (
+	"math/rand"
+	"sync"
+	"time"
+
+	log "github.com/golang/glog"
+)
+
+type clock interface {
+	Now() time.Time
+}
+
+type realClock struct{}
+
+func (realClock) Now() time.Time {
+	return time.Now()
+}
+
+type backoffEntry struct {
+	backoff    time.Duration
+	lastUpdate time.Time
+}
+
+type Backoff struct {
+	perItemBackoff  map[string]*backoffEntry
+	lock            sync.Mutex
+	clock           clock
+	defaultDuration time.Duration
+	maxDuration     time.Duration
+}
+
+func New(initial, max time.Duration) *Backoff {
+	return &Backoff{
+		perItemBackoff:  map[string]*backoffEntry{},
+		clock:           realClock{},
+		defaultDuration: initial,
+		maxDuration:     max,
+	}
+}
+
+func (p *Backoff) getEntry(id string) *backoffEntry {
+	p.lock.Lock()
+	defer p.lock.Unlock()
+	entry, ok := p.perItemBackoff[id]
+	if !ok {
+		entry = &backoffEntry{backoff: p.defaultDuration}
+		p.perItemBackoff[id] = entry
+	}
+	entry.lastUpdate = p.clock.Now()
+	return entry
+}
+
+func (p *Backoff) Get(id string) time.Duration {
+	entry := p.getEntry(id)
+	duration := entry.backoff
+	entry.backoff *= 2
+	if entry.backoff > p.maxDuration {
+		entry.backoff = p.maxDuration
+	}
+	//TODO(jdef) parameterize use of jitter?
+	// add jitter, get better backoff distribution
+	duration = time.Duration(rand.Int63n(int64(duration)))
+	log.V(3).Infof("Backing off %v for pod %s", duration, id)
+	return duration
+}
+
+// GC garbage-collects records that have aged past maxDuration. Backoff users
+// are expected to invoke this periodically.
+func (p *Backoff) GC() {
+	p.lock.Lock()
+	defer p.lock.Unlock()
+	now := p.clock.Now()
+	for id, entry := range p.perItemBackoff {
+		if now.Sub(entry.lastUpdate) > p.maxDuration {
+			delete(p.perItemBackoff, id)
+		}
+	}
+}
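+
+// Sketch of intended use (editor's illustration; tryLaunch, the id string and
+// the GC cadence are invented, not part of this patch):
+//
+//	b := backoff.New(1*time.Second, 32*time.Second)
+//	for {
+//		if err := tryLaunch(); err == nil {
+//			break
+//		}
+//		time.Sleep(b.Get("pod-abc")) // grows ~1s, 2s, 4s, ... capped at 32s, with jitter
+//	}
+//	// reclaim stale entries periodically, e.g. from a ticker goroutine: b.GC()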
diff --git a/contrib/mesos/pkg/backoff/doc.go b/contrib/mesos/pkg/backoff/doc.go
new file mode 100644
index 00000000000..1bd98a2617d
--- /dev/null
+++ b/contrib/mesos/pkg/backoff/doc.go
@@ -0,0 +1,19 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package backoff provides backoff functionality with a simple API.
+// Originally copied from Kubernetes: plugin/pkg/scheduler/factory/factory.go
+package backoff
diff --git a/contrib/mesos/pkg/election/doc.go b/contrib/mesos/pkg/election/doc.go
new file mode 100644
index 00000000000..35bbe4e142d
--- /dev/null
+++ b/contrib/mesos/pkg/election/doc.go
@@ -0,0 +1,18 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package election provides interfaces used for master election.
+package election
diff --git a/contrib/mesos/pkg/election/etcd_master.go b/contrib/mesos/pkg/election/etcd_master.go
new file mode 100644
index 00000000000..17f4d71fd80
--- /dev/null
+++ b/contrib/mesos/pkg/election/etcd_master.go
@@ -0,0 +1,185 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package election

+import (
+	"fmt"
+	"time"
+
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/tools"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
+	"github.com/coreos/go-etcd/etcd"
+	"github.com/golang/glog"
+)
+
+// Master is used to announce the current elected master.
+type Master string
+
+// IsAnAPIObject is used solely so we can work with the watch package.
+// TODO(k8s): Either fix watch so this isn't necessary, or make this a real API Object.
+// TODO(k8s): when it becomes clear how this package will be used, move these declarations
+// to the proper place.
+func (Master) IsAnAPIObject() {}
+
+// NewEtcdMasterElector returns an implementation of election.MasterElector backed by etcd.
+func NewEtcdMasterElector(h tools.EtcdGetSet) MasterElector {
+	return &etcdMasterElector{etcd: h}
+}
+
+type empty struct{}
+
+// internal implementation struct
+type etcdMasterElector struct {
+	etcd   tools.EtcdGetSet
+	done   chan empty
+	events chan watch.Event
+}
+
+// Elect implements the election.MasterElector interface.
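+// A sketch of a hypothetical consumer (editor's illustration; the etcd client,
+// path and id below are invented):
+//
+//	elector := NewEtcdMasterElector(etcdClient)
+//	w := elector.Elect("/registry/k8sm/leader", "scheduler-1")
+//	for event := range w.ResultChan() {
+//		if m, ok := event.Object.(Master); ok {
+//			// react to the currently elected master, e.g. compare m to our id
+//		}
+//	}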
+func (e *etcdMasterElector) Elect(path, id string) watch.Interface {
+	e.done = make(chan empty)
+	e.events = make(chan watch.Event)
+	go util.Forever(func() { e.run(path, id) }, time.Second*5)
+	return e
+}
+
+func (e *etcdMasterElector) run(path, id string) {
+	masters := make(chan string)
+	errors := make(chan error)
+	go e.master(path, id, 30, masters, errors, e.done) // TODO(jdef) extract constant
+	for {
+		select {
+		case m := <-masters:
+			e.events <- watch.Event{
+				Type:   watch.Modified,
+				Object: Master(m),
+			}
+		case err := <-errors:
+			glog.Errorf("error in election: %v", err)
+		}
+	}
+}
+
+// ResultChan implements the watch.Interface interface.
+func (e *etcdMasterElector) ResultChan() <-chan watch.Event {
+	return e.events
+}
+
+// extendMaster attempts to extend ownership of a master lock for TTL seconds.
+// returns "", nil if extension failed
+// returns id, nil if extension succeeded
+// returns "", err if an error occurred
+func (e *etcdMasterElector) extendMaster(path, id string, ttl uint64, res *etcd.Response) (string, error) {
+	// If it matches the passed in id, extend the lease by writing a new entry.
+	// Uses compare and swap, so that if we TTL out in the meantime, the write will fail.
+	// We don't handle the TTL delete w/o a write case here, it's handled in the next loop
+	// iteration.
+	_, err := e.etcd.CompareAndSwap(path, id, ttl, "", res.Node.ModifiedIndex)
+	if err != nil && !tools.IsEtcdTestFailed(err) {
+		return "", err
+	}
+	if err != nil && tools.IsEtcdTestFailed(err) {
+		return "", nil
+	}
+	return id, nil
+}
+
+// becomeMaster attempts to become the master for this lock.
+// returns "", nil if the attempt failed
+// returns id, nil if the attempt succeeded
+// returns "", err if an error occurred
+func (e *etcdMasterElector) becomeMaster(path, id string, ttl uint64) (string, error) {
+	_, err := e.etcd.Create(path, id, ttl)
+	if err != nil && !tools.IsEtcdNodeExist(err) {
+		// unexpected error
+		return "", err
+	}
+	if err != nil && tools.IsEtcdNodeExist(err) {
+		return "", nil
+	}
+	return id, nil
+}
+
+// handleMaster performs one loop of master locking.
+// on success it returns the current master value, nil
+// on error it returns "", err
+// in situations where you should try again due to concurrent state changes (e.g. another actor simultaneously acquiring the lock)
+// it returns "", nil
+func (e *etcdMasterElector) handleMaster(path, id string, ttl uint64) (string, error) {
+	res, err := e.etcd.Get(path, false, false)
+
+	// Unexpected error, bail out
+	if err != nil && !tools.IsEtcdNotFound(err) {
+		return "", err
+	}
+
+	// There is no master, try to become the master.
+	if err != nil && tools.IsEtcdNotFound(err) {
+		return e.becomeMaster(path, id, ttl)
+	}
+
+	// This should never happen.
+	if res.Node == nil {
+		return "", fmt.Errorf("unexpected response: %#v", res)
+	}
+
+	// We're not the master, just return the current value
+	if res.Node.Value != id {
+		return res.Node.Value, nil
+	}
+
+	// We are the master, try to extend our lease
+	return e.extendMaster(path, id, ttl, res)
+}
+
+// master provides a distributed master election lock: it maintains the lock
+// until failure, or until something is sent on the done channel.
+// The basic algorithm is:
+//	while !done
+//	  Get the current master
+//	  If there is no current master
+//	    Try to become the master
+//	  Otherwise
+//	    If we are the master, extend the lease
+//	    If the master is different than the last time through the loop, report the master
+//	  Sleep 80% of TTL
+func (e *etcdMasterElector) master(path, id string, ttl uint64, masters chan<- string, errors chan<- error, done <-chan empty) {
+	lastMaster := ""
+	for {
+		master, err := e.handleMaster(path, id, ttl)
+		if err != nil {
+			errors <- err
+		} else if len(master) == 0 {
+			continue
+		} else if master != lastMaster {
+			lastMaster = master
+			masters <- master
+		}
+		// TODO(k8s): Add Watch here, skip the polling for faster reactions
+		// If done is closed, break out.
+		select {
+		case <-done:
+			return
+		case <-time.After(time.Duration((ttl*8)/10) * time.Second):
+		}
+	}
+}
+
+// Stop implements the watch.Interface interface; it closes the done channel,
+// ending the election loop.
+func (e *etcdMasterElector) Stop() {
+	close(e.done)
+}
diff --git a/contrib/mesos/pkg/election/etcd_master_test.go b/contrib/mesos/pkg/election/etcd_master_test.go
new file mode 100644
index 00000000000..9facd532411
--- /dev/null
+++ b/contrib/mesos/pkg/election/etcd_master_test.go
@@ -0,0 +1,98 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package election + +import ( + "testing" + + "github.com/GoogleCloudPlatform/kubernetes/pkg/tools" + "github.com/GoogleCloudPlatform/kubernetes/pkg/watch" + "github.com/coreos/go-etcd/etcd" +) + +func TestEtcdMasterOther(t *testing.T) { + path := "foo" + etcd := tools.NewFakeEtcdClient(t) + etcd.Set(path, "baz", 0) + master := NewEtcdMasterElector(etcd) + w := master.Elect(path, "bar") + result := <-w.ResultChan() + if result.Type != watch.Modified || result.Object.(Master) != "baz" { + t.Errorf("unexpected event: %#v", result) + } + w.Stop() +} + +func TestEtcdMasterNoOther(t *testing.T) { + path := "foo" + e := tools.NewFakeEtcdClient(t) + e.TestIndex = true + e.Data["foo"] = tools.EtcdResponseWithError{ + R: &etcd.Response{ + Node: nil, + }, + E: &etcd.EtcdError{ + ErrorCode: tools.EtcdErrorCodeNotFound, + }, + } + master := NewEtcdMasterElector(e) + w := master.Elect(path, "bar") + result := <-w.ResultChan() + if result.Type != watch.Modified || result.Object.(Master) != "bar" { + t.Errorf("unexpected event: %#v", result) + } + w.Stop() +} + +func TestEtcdMasterNoOtherThenConflict(t *testing.T) { + path := "foo" + e := tools.NewFakeEtcdClient(t) + e.TestIndex = true + // Ok, so we set up a chain of responses from etcd: + // 1) Nothing there + // 2) conflict (someone else wrote) + // 3) new value (the data they wrote) + empty := tools.EtcdResponseWithError{ + R: &etcd.Response{ + Node: nil, + }, + E: &etcd.EtcdError{ + ErrorCode: tools.EtcdErrorCodeNotFound, + }, + } + empty.N = &tools.EtcdResponseWithError{ + R: &etcd.Response{}, + E: &etcd.EtcdError{ + ErrorCode: tools.EtcdErrorCodeNodeExist, + }, + } + empty.N.N = &tools.EtcdResponseWithError{ + R: &etcd.Response{ + Node: &etcd.Node{ + Value: "baz", + }, + }, + } + e.Data["foo"] = empty + master := NewEtcdMasterElector(e) + w := master.Elect(path, "bar") + result := <-w.ResultChan() + if result.Type != watch.Modified || result.Object.(Master) != "bar" { + t.Errorf("unexpected event: %#v", result) + } + w.Stop() +} diff --git a/contrib/mesos/pkg/election/fake.go b/contrib/mesos/pkg/election/fake.go new file mode 100644 index 00000000000..d4eaddfb3ec --- /dev/null +++ b/contrib/mesos/pkg/election/fake.go @@ -0,0 +1,53 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package election + +import ( + "sync" + + "github.com/GoogleCloudPlatform/kubernetes/pkg/watch" +) + +// Fake allows for testing of anything consuming a MasterElector. +type Fake struct { + mux *watch.Broadcaster + currentMaster Master + lock sync.Mutex // Protect access of currentMaster +} + +// NewFake makes a new fake MasterElector. +func NewFake() *Fake { + // 0 means block for clients. 
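+	// With a zero queue length and watch.WaitIfChannelFull, Action() blocks
+	// until slow watchers catch up instead of dropping events, so tests
+	// observe every master change in order.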
+	return &Fake{mux: watch.NewBroadcaster(0, watch.WaitIfChannelFull)}
+}
+
+func (f *Fake) ChangeMaster(newMaster Master) {
+	f.lock.Lock()
+	defer f.lock.Unlock()
+	f.mux.Action(watch.Modified, newMaster)
+	f.currentMaster = newMaster
+}
+
+func (f *Fake) Elect(path, id string) watch.Interface {
+	f.lock.Lock()
+	defer f.lock.Unlock()
+	w := f.mux.Watch()
+	if f.currentMaster != "" {
+		f.mux.Action(watch.Modified, f.currentMaster)
+	}
+	return w
+}
diff --git a/contrib/mesos/pkg/election/master.go b/contrib/mesos/pkg/election/master.go
new file mode 100644
index 00000000000..d5f1a76a7ca
--- /dev/null
+++ b/contrib/mesos/pkg/election/master.go
@@ -0,0 +1,134 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package election
+
+import (
+	"sync"
+
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
+
+	"github.com/golang/glog"
+)
+
+// MasterElector is an interface for services that can elect masters.
+// Important Note: MasterElectors are not inter-operable, all participants in the election need to be
+// using the same underlying implementation of this interface for correct behavior.
+type MasterElector interface {
+	// Elect makes the caller represented by 'id' enter into a master election for the
+	// distributed lock defined by 'path'.
+	// The returned watch.Interface provides a stream of Master objects which
+	// contain the current master.
+	// Calling Stop on the returned interface relinquishes ownership (if currently possessed)
+	// and removes the caller from the election.
+	Elect(path, id string) watch.Interface
+}
+
+// Service represents anything that can start and stop on demand.
+type Service interface {
+	Validate(desired, current Master)
+	Start()
+	Stop()
+}
+
+type notifier struct {
+	lock sync.Mutex
+	cond *sync.Cond
+
+	// desired is updated with every change, current is updated after
+	// Start()/Stop() finishes. 'cond' is used to signal that a change
+	// might be needed. This handles the case where mastership flops
+	// around without calling Start()/Stop() excessively.
+	desired, current Master
+
+	// for comparison, to see if we are master.
+	id Master
+
+	service Service
+}
+
+// Notify runs Elect() on m, and calls Start()/Stop() on s when the
+// elected master starts/stops matching 'id'. It blocks until the abort
+// channel is closed.
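+// A minimal wiring sketch (editor's illustration; the elector, service and
+// names below are invented):
+//
+//	abort := make(chan struct{})
+//	go Notify(elector, "/registry/k8sm/leader", "scheduler-1", svc, abort)
+//	// ... later, to withdraw from the election and stop the notifier:
+//	close(abort)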
+func Notify(m MasterElector, path, id string, s Service, abort <-chan struct{}) {
+	n := &notifier{id: Master(id), service: s}
+	n.cond = sync.NewCond(&n.lock)
+	finished := runtime.After(func() {
+		runtime.Until(func() {
+			for {
+				w := m.Elect(path, id)
+			eventLoop:
+				for {
+					select {
+					case <-abort:
+						return
+					case event, open := <-w.ResultChan():
+						if !open {
+							// the watch ended; leave the inner loop and re-elect
+							break eventLoop
+						}
+						if event.Type != watch.Modified {
+							continue
+						}
+						electedMaster, ok := event.Object.(Master)
+						if !ok {
+							glog.Errorf("Unexpected object from election channel: %v", event.Object)
+							break eventLoop
+						}
+						func() {
+							n.lock.Lock()
+							defer n.lock.Unlock()
+							n.desired = electedMaster
+							if n.desired != n.current {
+								n.cond.Signal()
+							}
+						}()
+					}
+				}
+			}
+		}, 0, abort)
+	})
+	runtime.Until(func() { n.serviceLoop(finished) }, 0, abort)
+}
+
+// serviceLoop waits for changes, and calls Start()/Stop() as needed.
+func (n *notifier) serviceLoop(abort <-chan struct{}) {
+	n.lock.Lock()
+	defer n.lock.Unlock()
+	for {
+		select {
+		case <-abort:
+			return
+		default:
+			for n.desired == n.current {
+				ch := runtime.After(n.cond.Wait)
+				select {
+				case <-abort:
+					n.cond.Signal() // ensure that Wait() returns
+					<-ch
+					return
+				case <-ch:
+					// we were notified and have the lock, proceed..
+				}
+			}
+			if n.current != n.id && n.desired == n.id {
+				n.service.Validate(n.desired, n.current)
+				n.service.Start()
+			} else if n.current == n.id && n.desired != n.id {
+				n.service.Stop()
+			}
+			n.current = n.desired
+		}
+	}
+}
diff --git a/contrib/mesos/pkg/election/master_test.go b/contrib/mesos/pkg/election/master_test.go
new file mode 100644
index 00000000000..5584ab25fae
--- /dev/null
+++ b/contrib/mesos/pkg/election/master_test.go
@@ -0,0 +1,98 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package election
+
+import (
+	"testing"
+	"time"
+
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
+)
+
+type slowService struct {
+	t  *testing.T
+	on bool
+	// We explicitly have no lock to prove that
+	// Start and Stop are not called concurrently.
+ changes chan<- bool + done <-chan struct{} +} + +func (s *slowService) Validate(d, c Master) { + // noop +} + +func (s *slowService) Start() { + select { + case <-s.done: + return // avoid writing to closed changes chan + default: + } + if s.on { + s.t.Errorf("started already on service") + } + time.Sleep(2 * time.Millisecond) + s.on = true + s.changes <- true +} + +func (s *slowService) Stop() { + select { + case <-s.done: + return // avoid writing to closed changes chan + default: + } + if !s.on { + s.t.Errorf("stopped already off service") + } + time.Sleep(2 * time.Millisecond) + s.on = false + s.changes <- false +} + +func Test(t *testing.T) { + m := NewFake() + changes := make(chan bool, 1500) + done := make(chan struct{}) + s := &slowService{t: t, changes: changes, done: done} + notifyDone := runtime.After(func() { Notify(m, "", "me", s, done) }) + + go func() { + defer close(done) + for i := 0; i < 500; i++ { + for _, key := range []string{"me", "notme", "alsonotme"} { + m.ChangeMaster(Master(key)) + } + } + }() + + <-notifyDone + close(changes) + + changeList := []bool{} + for { + change, ok := <-changes + if !ok { + break + } + changeList = append(changeList, change) + } + + if len(changeList) > 1000 { + t.Errorf("unexpected number of changes: %v", len(changeList)) + } +} diff --git a/contrib/mesos/pkg/executor/config/config.go b/contrib/mesos/pkg/executor/config/config.go new file mode 100644 index 00000000000..999058dbc8b --- /dev/null +++ b/contrib/mesos/pkg/executor/config/config.go @@ -0,0 +1,29 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package config + +import ( + "time" +) + +// default values to use when constructing mesos ExecutorInfo messages +const ( + DefaultInfoID = "k8sm-executor" + DefaultInfoSource = "kubernetes" + DefaultInfoName = "Kubelet-Executor" + DefaultSuicideTimeout = 20 * time.Minute +) diff --git a/contrib/mesos/pkg/executor/config/doc.go b/contrib/mesos/pkg/executor/config/doc.go new file mode 100644 index 00000000000..7a44f3e7b5f --- /dev/null +++ b/contrib/mesos/pkg/executor/config/doc.go @@ -0,0 +1,18 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package config contains executor configuration constants. 
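+//
+// A scheduler might seed a mesos ExecutorInfo from these defaults, e.g.
+// (editor's sketch; the mesosutil and proto helpers are assumed from mesos-go
+// and gogo/protobuf, and are not part of this patch):
+//
+//	info := &mesosproto.ExecutorInfo{
+//		ExecutorId: mesosutil.NewExecutorID(config.DefaultInfoID),
+//		Name:       proto.String(config.DefaultInfoName),
+//		Source:     proto.String(config.DefaultInfoSource),
+//	}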
+package config diff --git a/contrib/mesos/pkg/executor/doc.go b/contrib/mesos/pkg/executor/doc.go new file mode 100644 index 00000000000..5ac5e9d8f82 --- /dev/null +++ b/contrib/mesos/pkg/executor/doc.go @@ -0,0 +1,21 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* +Package executor includes a mesos executor, which contains +a kubelet as its member to manage containers. +*/ +package executor diff --git a/contrib/mesos/pkg/executor/executor.go b/contrib/mesos/pkg/executor/executor.go new file mode 100644 index 00000000000..c9a93233e76 --- /dev/null +++ b/contrib/mesos/pkg/executor/executor.go @@ -0,0 +1,846 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package executor
+
+import (
+	"encoding/json"
+	"fmt"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/messages"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/container"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/dockertools"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
+	"github.com/fsouza/go-dockerclient"
+	"github.com/gogo/protobuf/proto"
+	log "github.com/golang/glog"
+	bindings "github.com/mesos/mesos-go/executor"
+	mesos "github.com/mesos/mesos-go/mesosproto"
+	mutil "github.com/mesos/mesos-go/mesosutil"
+)
+
+const (
+	containerPollTime = 300 * time.Millisecond
+	launchGracePeriod = 5 * time.Minute
+)
+
+type stateType int32
+
+const (
+	disconnectedState stateType = iota
+	connectedState
+	suicidalState
+	terminalState
+)
+
+func (s *stateType) get() stateType {
+	return stateType(atomic.LoadInt32((*int32)(s)))
+}
+
+func (s *stateType) transition(from, to stateType) bool {
+	return atomic.CompareAndSwapInt32((*int32)(s), int32(from), int32(to))
+}
+
+func (s *stateType) transitionTo(to stateType, unless ...stateType) bool {
+	if len(unless) == 0 {
+		atomic.StoreInt32((*int32)(s), int32(to))
+		return true
+	}
+	for {
+		state := s.get()
+		for _, x := range unless {
+			if state == x {
+				return false
+			}
+		}
+		if s.transition(state, to) {
+			return true
+		}
+	}
+}
+
+type kuberTask struct {
+	mesosTaskInfo *mesos.TaskInfo
+	podName       string
+}
+
+// func that attempts suicide
+type jumper func(bindings.ExecutorDriver, <-chan struct{})
+
+type suicideWatcher interface {
+	Next(time.Duration, bindings.ExecutorDriver, jumper) suicideWatcher
+	Reset(time.Duration) bool
+	Stop() bool
+}
+
+type podStatusFunc func() (*api.PodStatus, error)
+
+// KubernetesExecutor is a Mesos executor that runs pods
+// on a minion machine.
+type KubernetesExecutor struct {
+	kl *kubelet.Kubelet // the kubelet instance.
+ updateChan chan<- interface{} // to send pod config updates to the kubelet + state stateType + tasks map[string]*kuberTask + pods map[string]*api.Pod + lock sync.RWMutex + sourcename string + client *client.Client + events <-chan watch.Event + done chan struct{} // signals shutdown + outgoing chan func() (mesos.Status, error) // outgoing queue to the mesos driver + dockerClient dockertools.DockerInterface + suicideWatch suicideWatcher + suicideTimeout time.Duration + shutdownAlert func() // invoked just prior to executor shutdown + kubeletFinished <-chan struct{} // signals that kubelet Run() died + initialRegistration sync.Once + exitFunc func(int) + podStatusFunc func(*kubelet.Kubelet, *api.Pod) (*api.PodStatus, error) +} + +type Config struct { + Kubelet *kubelet.Kubelet + Updates chan<- interface{} // to send pod config updates to the kubelet + SourceName string + APIClient *client.Client + Watch watch.Interface + Docker dockertools.DockerInterface + ShutdownAlert func() + SuicideTimeout time.Duration + KubeletFinished <-chan struct{} // signals that kubelet Run() died + ExitFunc func(int) + PodStatusFunc func(*kubelet.Kubelet, *api.Pod) (*api.PodStatus, error) +} + +func (k *KubernetesExecutor) isConnected() bool { + return connectedState == (&k.state).get() +} + +// New creates a new kubernetes executor. +func New(config Config) *KubernetesExecutor { + k := &KubernetesExecutor{ + kl: config.Kubelet, + updateChan: config.Updates, + state: disconnectedState, + tasks: make(map[string]*kuberTask), + pods: make(map[string]*api.Pod), + sourcename: config.SourceName, + client: config.APIClient, + done: make(chan struct{}), + outgoing: make(chan func() (mesos.Status, error), 1024), + dockerClient: config.Docker, + suicideTimeout: config.SuicideTimeout, + kubeletFinished: config.KubeletFinished, + suicideWatch: &suicideTimer{}, + shutdownAlert: config.ShutdownAlert, + exitFunc: config.ExitFunc, + podStatusFunc: config.PodStatusFunc, + } + //TODO(jdef) do something real with these events.. + if config.Watch != nil { + events := config.Watch.ResultChan() + if events != nil { + go func() { + for e := range events { + // e ~= watch.Event { ADDED, *api.Event } + log.V(1).Info(e) + } + }() + k.events = events + } + } + return k +} + +func (k *KubernetesExecutor) Init(driver bindings.ExecutorDriver) { + k.killKubeletContainers() + k.resetSuicideWatch(driver) + go k.sendLoop() + //TODO(jdef) monitor kubeletFinished and shutdown if it happens +} + +func (k *KubernetesExecutor) Done() <-chan struct{} { + return k.done +} + +func (k *KubernetesExecutor) isDone() bool { + select { + case <-k.done: + return true + default: + return false + } +} + +// Registered is called when the executor is successfully registered with the slave. +func (k *KubernetesExecutor) Registered(driver bindings.ExecutorDriver, + executorInfo *mesos.ExecutorInfo, frameworkInfo *mesos.FrameworkInfo, slaveInfo *mesos.SlaveInfo) { + if k.isDone() { + return + } + log.Infof("Executor %v of framework %v registered with slave %v\n", + executorInfo, frameworkInfo, slaveInfo) + if !(&k.state).transition(disconnectedState, connectedState) { + log.Errorf("failed to register/transition to a connected state") + } + k.initialRegistration.Do(k.onInitialRegistration) +} + +// Reregistered is called when the executor is successfully re-registered with the slave. +// This can happen when the slave fails over. 
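+// On the first successful registration or re-registration the executor emits
+// a single empty PodUpdate (via the sync.Once around onInitialRegistration)
+// so the kubelet marks the mesos pod-config source as seen.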
+func (k *KubernetesExecutor) Reregistered(driver bindings.ExecutorDriver, slaveInfo *mesos.SlaveInfo) {
+	if k.isDone() {
+		return
+	}
+	log.Infof("Reregistered with slave %v\n", slaveInfo)
+	if !(&k.state).transition(disconnectedState, connectedState) {
+		log.Errorf("failed to reregister/transition to a connected state")
+	}
+	k.initialRegistration.Do(k.onInitialRegistration)
+}
+
+func (k *KubernetesExecutor) onInitialRegistration() {
+	// emit an empty update to allow the mesos "source" to be marked as seen
+	k.updateChan <- kubelet.PodUpdate{
+		Pods:   []*api.Pod{},
+		Op:     kubelet.SET,
+		Source: k.sourcename,
+	}
+}
+
+// Disconnected is called when the executor is disconnected from the slave.
+func (k *KubernetesExecutor) Disconnected(driver bindings.ExecutorDriver) {
+	if k.isDone() {
+		return
+	}
+	log.Infof("Slave is disconnected\n")
+	if !(&k.state).transition(connectedState, disconnectedState) {
+		log.Errorf("failed to disconnect/transition to a disconnected state")
+	}
+}
+
+// LaunchTask is called when the executor receives a request to launch a task.
+// This happens when the k8sm scheduler has decided to schedule the pod
+// (which corresponds to a Mesos Task) onto the node where this executor
+// is running, but the binding is not recorded in the Kubernetes store yet.
+// This function is invoked to tell the executor to record the binding in the
+// Kubernetes store and start the pod via the Kubelet.
+func (k *KubernetesExecutor) LaunchTask(driver bindings.ExecutorDriver, taskInfo *mesos.TaskInfo) {
+	if k.isDone() {
+		return
+	}
+	log.Infof("Launch task %v\n", taskInfo)
+
+	if !k.isConnected() {
+		log.Errorf("Ignoring launch task because the executor is disconnected\n")
+		k.sendStatus(driver, newStatus(taskInfo.GetTaskId(), mesos.TaskState_TASK_FAILED,
+			messages.ExecutorUnregistered))
+		return
+	}
+
+	obj, err := api.Codec.Decode(taskInfo.GetData())
+	if err != nil {
+		log.Errorf("failed to extract yaml data from taskInfo.data: %v", err)
+		k.sendStatus(driver, newStatus(taskInfo.GetTaskId(), mesos.TaskState_TASK_FAILED,
+			messages.UnmarshalTaskDataFailure))
+		return
+	}
+	pod, ok := obj.(*api.Pod)
+	if !ok {
+		// report the decoded object, not the nil pod from the failed assertion
+		log.Errorf("expected *api.Pod instead of %T: %+v", obj, obj)
+		k.sendStatus(driver, newStatus(taskInfo.GetTaskId(), mesos.TaskState_TASK_FAILED,
+			messages.UnmarshalTaskDataFailure))
+		return
+	}
+
+	k.lock.Lock()
+	defer k.lock.Unlock()
+
+	taskId := taskInfo.GetTaskId().GetValue()
+	if _, found := k.tasks[taskId]; found {
+		log.Errorf("task already launched\n")
+		// Do not send TASK_RUNNING back here: this may be a duplicate message
+		// or a duplicate task id.
+		return
+	}
+	// remember this task so that:
+	// (a) we ignore future launches for it
+	// (b) we have a record of it so that we can kill it if needed
+	// (c) we're leaving podName == "" for now, indicates we don't need to delete containers
+	k.tasks[taskId] = &kuberTask{
+		mesosTaskInfo: taskInfo,
+	}
+	k.resetSuicideWatch(driver)
+
+	go k.launchTask(driver, taskId, pod)
+}
+
+// TODO(jdef) add metrics for this?
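+// suicideTimer implements suicideWatcher on top of time.AfterFunc: Next arms
+// a fresh timer that fires the jumper once the executor has been idle for the
+// full duration, while Reset and Stop proxy to the underlying time.Timer.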
+type suicideTimer struct { + timer *time.Timer +} + +func (w *suicideTimer) Next(d time.Duration, driver bindings.ExecutorDriver, f jumper) suicideWatcher { + return &suicideTimer{ + timer: time.AfterFunc(d, func() { + log.Warningf("Suicide timeout (%v) expired", d) + f(driver, nil) + }), + } +} + +func (w *suicideTimer) Stop() (result bool) { + if w != nil && w.timer != nil { + log.Infoln("stopping suicide watch") //TODO(jdef) debug + result = w.timer.Stop() + } + return +} + +// return true if the timer was successfully reset +func (w *suicideTimer) Reset(d time.Duration) bool { + if w != nil && w.timer != nil { + log.Infoln("resetting suicide watch") //TODO(jdef) debug + w.timer.Reset(d) + return true + } + return false +} + +// determine whether we need to start a suicide countdown. if so, then start +// a timer that, upon expiration, causes this executor to commit suicide. +// this implementation runs asynchronously. callers that wish to wait for the +// reset to complete may wait for the returned signal chan to close. +func (k *KubernetesExecutor) resetSuicideWatch(driver bindings.ExecutorDriver) <-chan struct{} { + ch := make(chan struct{}) + go func() { + defer close(ch) + k.lock.Lock() + defer k.lock.Unlock() + + if k.suicideTimeout < 1 { + return + } + + if k.suicideWatch != nil { + if len(k.tasks) > 0 { + k.suicideWatch.Stop() + return + } + if k.suicideWatch.Reset(k.suicideTimeout) { + // valid timer, reset was successful + return + } + } + + //TODO(jdef) reduce verbosity here once we're convinced that suicide watch is working properly + log.Infof("resetting suicide watch timer for %v", k.suicideTimeout) + + k.suicideWatch = k.suicideWatch.Next(k.suicideTimeout, driver, jumper(k.attemptSuicide)) + }() + return ch +} + +func (k *KubernetesExecutor) attemptSuicide(driver bindings.ExecutorDriver, abort <-chan struct{}) { + k.lock.Lock() + defer k.lock.Unlock() + + // this attempt may have been queued and since been aborted + select { + case <-abort: + //TODO(jdef) reduce verbosity once suicide watch is working properly + log.Infof("aborting suicide attempt since watch was cancelled") + return + default: // continue + } + + // fail-safe, will abort kamikaze attempts if there are tasks + if len(k.tasks) > 0 { + ids := []string{} + for taskid := range k.tasks { + ids = append(ids, taskid) + } + log.Errorf("suicide attempt failed, there are still running tasks: %v", ids) + return + } + + log.Infoln("Attempting suicide") + if (&k.state).transitionTo(suicidalState, suicidalState, terminalState) { + //TODO(jdef) let the scheduler know? + //TODO(jdef) is suicide more graceful than slave-demanded shutdown? 
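+		// transitionTo(to, unless...) is assumed to swap the state to `to`
+		// unless the current state is already one of the `unless` states;
+		// here that means the shutdown below runs at most once and never
+		// out of terminalState.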
+ k.doShutdown(driver) + } +} + +// async continuation of LaunchTask +func (k *KubernetesExecutor) launchTask(driver bindings.ExecutorDriver, taskId string, pod *api.Pod) { + + //HACK(jdef): cloned binding construction from k8s plugin/pkg/scheduler/scheduler.go + binding := &api.Binding{ + ObjectMeta: api.ObjectMeta{ + Namespace: pod.Namespace, + Name: pod.Name, + Annotations: make(map[string]string), + }, + Target: api.ObjectReference{ + Kind: "Node", + Name: pod.Annotations[meta.BindingHostKey], + }, + } + + // forward the annotations that the scheduler wants to apply + for k, v := range pod.Annotations { + binding.Annotations[k] = v + } + + deleteTask := func() { + k.lock.Lock() + defer k.lock.Unlock() + delete(k.tasks, taskId) + k.resetSuicideWatch(driver) + } + + log.Infof("Binding '%v/%v' to '%v' with annotations %+v...", pod.Namespace, pod.Name, binding.Target.Name, binding.Annotations) + ctx := api.WithNamespace(api.NewContext(), binding.Namespace) + // TODO(k8s): use Pods interface for binding once clusters are upgraded + // return b.Pods(binding.Namespace).Bind(binding) + err := k.client.Post().Namespace(api.NamespaceValue(ctx)).Resource("bindings").Body(binding).Do().Error() + if err != nil { + deleteTask() + k.sendStatus(driver, newStatus(mutil.NewTaskID(taskId), mesos.TaskState_TASK_FAILED, + messages.CreateBindingFailure)) + return + } + podFullName := container.GetPodFullName(pod) + + // allow a recently failed-over scheduler the chance to recover the task/pod binding: + // it may have failed and recovered before the apiserver is able to report the updated + // binding information. replays of this status event will signal to the scheduler that + // the apiserver should be up-to-date. + data, err := json.Marshal(api.PodStatusResult{ + ObjectMeta: api.ObjectMeta{ + Name: podFullName, + SelfLink: "/podstatusresult", + }, + }) + if err != nil { + deleteTask() + log.Errorf("failed to marshal pod status result: %v", err) + k.sendStatus(driver, newStatus(mutil.NewTaskID(taskId), mesos.TaskState_TASK_FAILED, + err.Error())) + return + } + + k.lock.Lock() + defer k.lock.Unlock() + + // Add the task. + task, found := k.tasks[taskId] + if !found { + log.V(1).Infof("task %v not found, probably killed: aborting launch, reporting lost", taskId) + k.reportLostTask(driver, taskId, messages.LaunchTaskFailed) + return + } + + //TODO(jdef) check for duplicate pod name, if found send TASK_ERROR + + // from here on, we need to delete containers associated with the task + // upon it going into a terminal state + task.podName = podFullName + k.pods[podFullName] = pod + + // send the latest snapshot of the set of pods to the kubelet via the pod update channel + update := kubelet.PodUpdate{Op: kubelet.SET} + for _, p := range k.pods { + update.Pods = append(update.Pods, p) + } + k.updateChan <- update + + statusUpdate := &mesos.TaskStatus{ + TaskId: mutil.NewTaskID(taskId), + State: mesos.TaskState_TASK_STARTING.Enum(), + Message: proto.String(messages.CreateBindingSuccess), + Data: data, + } + k.sendStatus(driver, statusUpdate) + + // Delay reporting 'task running' until container is up. 
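+	// The podStatusFunc conversion below implies this package declares
+	//	type podStatusFunc func() (*api.PodStatus, error)
+	// The closure captures the kubelet and pod so pollers can sample the
+	// current pod status without further arguments.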
+	psf := podStatusFunc(func() (*api.PodStatus, error) {
+		return k.podStatusFunc(k.kl, pod)
+	})
+
+	go k._launchTask(driver, taskId, podFullName, psf)
+}
+
+func (k *KubernetesExecutor) _launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
+
+	expired := make(chan struct{})
+	time.AfterFunc(launchGracePeriod, func() { close(expired) })
+
+	getMarshalledInfo := func() (data []byte, cancel bool) {
+		// potentially long call..
+		if podStatus, err := psf(); err == nil && podStatus != nil {
+			select {
+			case <-expired:
+				cancel = true
+			default:
+				k.lock.Lock()
+				defer k.lock.Unlock()
+				if _, found := k.tasks[taskId]; !found {
+					// don't bother with the pod status if the task is already gone
+					cancel = true
+					break
+				} else if podStatus.Phase != api.PodRunning {
+					// avoid sending back a running status before it's really running
+					break
+				}
+				log.V(2).Infof("Found pod status: '%v'", podStatus)
+				result := api.PodStatusResult{
+					ObjectMeta: api.ObjectMeta{
+						Name:     podFullName,
+						SelfLink: "/podstatusresult",
+					},
+					Status: *podStatus,
+				}
+				if data, err = json.Marshal(result); err != nil {
+					log.Errorf("failed to marshal pod status result: %v", err)
+				}
+			}
+		}
+		return
+	}
+
+waitForRunningPod:
+	for {
+		select {
+		case <-expired:
+			log.Warningf("Launch grace period of '%v' expired", launchGracePeriod)
+			break waitForRunningPod
+		case <-time.After(containerPollTime):
+			if data, cancel := getMarshalledInfo(); cancel {
+				break waitForRunningPod
+			} else if data == nil {
+				continue waitForRunningPod
+			} else {
+				k.lock.Lock()
+				defer k.lock.Unlock()
+				if _, found := k.tasks[taskId]; !found {
+					goto reportLost
+				}
+
+				statusUpdate := &mesos.TaskStatus{
+					TaskId:  mutil.NewTaskID(taskId),
+					State:   mesos.TaskState_TASK_RUNNING.Enum(),
+					Message: proto.String(fmt.Sprintf("pod-running:%s", podFullName)),
+					Data:    data,
+				}
+
+				k.sendStatus(driver, statusUpdate)
+
+				// continue to monitor the health of the pod
+				go k.__launchTask(driver, taskId, podFullName, psf)
+				return
+			}
+		}
+	}
+
+	k.lock.Lock()
+	defer k.lock.Unlock()
+reportLost:
+	k.reportLostTask(driver, taskId, messages.LaunchTaskFailed)
+}
+
+func (k *KubernetesExecutor) __launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
+	// TODO(nnielsen): Monitor health of pod and report if lost.
+	// Should we also allow this to fail a couple of times before reporting lost?
+	// What if the docker daemon is restarting and we can't connect, but it's
+	// going to bring the pods back online as soon as it restarts?
+	knownPod := func() bool {
+		_, err := psf()
+		return err == nil
+	}
+	// Wait for the pod to go away and stop monitoring once it does
+	// TODO (jdefelice) replace with an /events watch?
+	for {
+		time.Sleep(containerPollTime)
+		if k.checkForLostPodTask(driver, taskId, knownPod) {
+			return
+		}
+	}
+}
+
+// Intended to be executed as part of the pod monitoring loop, this fn (ultimately) checks with Docker
+// whether the pod is running. It will only return false if the task is still registered and the pod is
+// registered in Docker. Otherwise it returns true. If there's still a task record on file, but no pod
+// in Docker, then we'll also send a TASK_LOST event.
+func (k *KubernetesExecutor) checkForLostPodTask(driver bindings.ExecutorDriver, taskId string, isKnownPod func() bool) bool {
+	// TODO (jdefelice) don't send false alarms for deleted pods (KILLED tasks)
+	k.lock.Lock()
+	defer k.lock.Unlock()
+
+	// TODO(jdef) we should really consider k.pods here, along with what docker is reporting, since the
+	// kubelet may constantly attempt to instantiate a pod as long as it's in the pod state that we're
+	// handing to it. otherwise, we're probably reporting a TASK_LOST prematurely. Should probably
+	// consult RestartPolicy to determine appropriate behavior. Should probably also gracefully handle
+	// docker daemon restarts.
+	if _, ok := k.tasks[taskId]; ok {
+		if isKnownPod() {
+			return false
+		} else {
+			log.Warningf("Detected lost pod, reporting lost task %v", taskId)
+			k.reportLostTask(driver, taskId, messages.ContainersDisappeared)
+		}
+	} else {
+		log.V(2).Infof("Task %v no longer registered, stop monitoring for lost pods", taskId)
+	}
+	return true
+}
+
+// KillTask is called when the executor receives a request to kill a task.
+func (k *KubernetesExecutor) KillTask(driver bindings.ExecutorDriver, taskId *mesos.TaskID) {
+	if k.isDone() {
+		return
+	}
+	log.Infof("Kill task %v\n", taskId)
+
+	if !k.isConnected() {
+		//TODO(jdefelice) send TASK_LOST here?
+		log.Warningf("Ignoring kill task because the executor is disconnected\n")
+		return
+	}
+
+	k.lock.Lock()
+	defer k.lock.Unlock()
+	k.removePodTask(driver, taskId.GetValue(), messages.TaskKilled, mesos.TaskState_TASK_KILLED)
+}
+
+// Reports a lost task to the slave and updates internal task and pod tracking state.
+// Assumes that the caller is locking around pod and task state.
+func (k *KubernetesExecutor) reportLostTask(driver bindings.ExecutorDriver, tid, reason string) {
+	k.removePodTask(driver, tid, reason, mesos.TaskState_TASK_LOST)
+}
+
+// deletes the pod and task associated with the task identified by tid and sends a task
+// status update to mesos. also attempts to reset the suicide watch.
+// Assumes that the caller is locking around pod and task state.
+func (k *KubernetesExecutor) removePodTask(driver bindings.ExecutorDriver, tid, reason string, state mesos.TaskState) {
+	task, ok := k.tasks[tid]
+	if !ok {
+		log.V(1).Infof("Failed to remove task, unknown task %v\n", tid)
+		return
+	}
+	delete(k.tasks, tid)
+	k.resetSuicideWatch(driver)
+
+	pid := task.podName
+	if _, found := k.pods[pid]; !found {
+		log.Warningf("Cannot remove unknown pod %v for task %v", pid, tid)
+	} else {
+		log.V(2).Infof("deleting pod %v for task %v", pid, tid)
+		delete(k.pods, pid)
+
+		// Send the pod updates to the channel.
+		update := kubelet.PodUpdate{Op: kubelet.SET}
+		for _, p := range k.pods {
+			update.Pods = append(update.Pods, p)
+		}
+		k.updateChan <- update
+	}
+	// TODO(jdef): ensure that the update propagates, perhaps return a signal chan?
+	k.sendStatus(driver, newStatus(mutil.NewTaskID(tid), state, reason))
+}
+
+// FrameworkMessage is called when the framework sends some message to the executor
+func (k *KubernetesExecutor) FrameworkMessage(driver bindings.ExecutorDriver, message string) {
+	if k.isDone() {
+		return
+	}
+	if !k.isConnected() {
+		log.Warningf("Ignoring framework message because the executor is disconnected\n")
+		return
+	}
+
+	log.Infof("Received message from framework %v\n", message)
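+	// Framework messages are free-form strings; the scheduler and executor
+	// share ad-hoc conventions: "task-lost:<taskId>" asks the executor to
+	// purge local state for a task the master reported lost, and
+	// messages.Kamikaze requests immediate self-termination.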
+	//TODO(jdef) master reported a lost task, reconcile this! @see scheduler.go:handleTaskLost
+	if strings.HasPrefix(message, "task-lost:") && len(message) > 10 {
+		taskId := message[10:]
+		if taskId != "" {
+			// clean up pod state
+			k.lock.Lock()
+			defer k.lock.Unlock()
+			k.reportLostTask(driver, taskId, messages.TaskLostAck)
+		}
+	}
+
+	switch message {
+	case messages.Kamikaze:
+		k.attemptSuicide(driver, nil)
+	}
+}
+
+// Shutdown is called when the executor receives a shutdown request.
+func (k *KubernetesExecutor) Shutdown(driver bindings.ExecutorDriver) {
+	k.lock.Lock()
+	defer k.lock.Unlock()
+	k.doShutdown(driver)
+}
+
+// assumes that caller has obtained state lock
+func (k *KubernetesExecutor) doShutdown(driver bindings.ExecutorDriver) {
+	defer func() {
+		log.Errorf("exiting with unclean shutdown: %v", recover())
+		if k.exitFunc != nil {
+			k.exitFunc(1)
+		}
+	}()
+
+	(&k.state).transitionTo(terminalState)
+
+	// signal to all listeners that this KubeletExecutor is done!
+	close(k.done)
+
+	if k.shutdownAlert != nil {
+		func() {
+			// HandleCrash only recovers from a panic when deferred
+			defer util.HandleCrash()
+			k.shutdownAlert()
+		}()
+	}
+
+	log.Infoln("Stopping executor driver")
+	_, err := driver.Stop()
+	if err != nil {
+		log.Warningf("failed to stop executor driver: %v", err)
+	}
+
+	log.Infoln("Shutting down the executor")
+
+	// according to docs, mesos will generate TASK_LOST updates for us
+	// if needed, so don't take extra time to do that here.
+	k.tasks = map[string]*kuberTask{}
+
+	select {
+	// the main Run() func may still be running... wait for it to finish: it will
+	// clear the pod configuration cleanly, telling k8s "there are no pods" and
+	// clean up resources (pods, volumes, etc).
+	case <-k.kubeletFinished:
+
+	//TODO(jdef) attempt to wait for events to propagate to API server?
+
+	// TODO(jdef) extract constant, should be smaller than whatever the
+	// slave graceful shutdown timeout period is.
+	case <-time.After(15 * time.Second):
+		log.Errorf("timed out waiting for kubelet Run() to die")
+	}
+
+	log.Infoln("exiting")
+	if k.exitFunc != nil {
+		k.exitFunc(0)
+	}
+}
+
+// Destroy existing k8s containers
+func (k *KubernetesExecutor) killKubeletContainers() {
+	if containers, err := dockertools.GetKubeletDockerContainers(k.dockerClient, true); err == nil {
+		opts := docker.RemoveContainerOptions{
+			RemoveVolumes: true,
+			Force:         true,
+		}
+		for _, container := range containers {
+			opts.ID = container.ID
+			log.V(2).Infof("Removing container: %v", opts.ID)
+			if err := k.dockerClient.RemoveContainer(opts); err != nil {
+				log.Warning(err)
+			}
+		}
+	} else {
+		log.Warningf("Failed to list kubelet docker containers: %v", err)
+	}
+}
+
+// Error is called when some error happens.
+func (k *KubernetesExecutor) Error(driver bindings.ExecutorDriver, message string) { + log.Errorln(message) +} + +func newStatus(taskId *mesos.TaskID, state mesos.TaskState, message string) *mesos.TaskStatus { + return &mesos.TaskStatus{ + TaskId: taskId, + State: &state, + Message: proto.String(message), + } +} + +func (k *KubernetesExecutor) sendStatus(driver bindings.ExecutorDriver, status *mesos.TaskStatus) { + select { + case <-k.done: + default: + k.outgoing <- func() (mesos.Status, error) { return driver.SendStatusUpdate(status) } + } +} + +func (k *KubernetesExecutor) sendFrameworkMessage(driver bindings.ExecutorDriver, msg string) { + select { + case <-k.done: + default: + k.outgoing <- func() (mesos.Status, error) { return driver.SendFrameworkMessage(msg) } + } +} + +func (k *KubernetesExecutor) sendLoop() { + defer log.V(1).Info("sender loop exiting") + for { + select { + case <-k.done: + return + default: + if !k.isConnected() { + select { + case <-k.done: + case <-time.After(1 * time.Second): + } + continue + } + sender, ok := <-k.outgoing + if !ok { + // programming error + panic("someone closed the outgoing channel") + } + if status, err := sender(); err == nil { + continue + } else { + log.Error(err) + if status == mesos.Status_DRIVER_ABORTED { + return + } + } + // attempt to re-queue the sender + select { + case <-k.done: + case k.outgoing <- sender: + } + } + } +} diff --git a/contrib/mesos/pkg/executor/executor_test.go b/contrib/mesos/pkg/executor/executor_test.go new file mode 100644 index 00000000000..96f87270269 --- /dev/null +++ b/contrib/mesos/pkg/executor/executor_test.go @@ -0,0 +1,618 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package executor
+
+import (
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"reflect"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	assertext "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/assert"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/messages"
+	kmruntime "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api/testapi"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/dockertools"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/runtime"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
+
+	"github.com/golang/glog"
+	bindings "github.com/mesos/mesos-go/executor"
+	"github.com/mesos/mesos-go/mesosproto"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/mock"
+)
+
+type suicideTracker struct {
+	suicideWatcher
+	stops  uint32
+	resets uint32
+	timers uint32
+	jumps  *uint32
+}
+
+func (t *suicideTracker) Reset(d time.Duration) bool {
+	defer func() { t.resets++ }()
+	return t.suicideWatcher.Reset(d)
+}
+
+func (t *suicideTracker) Stop() bool {
+	defer func() { t.stops++ }()
+	return t.suicideWatcher.Stop()
+}
+
+func (t *suicideTracker) Next(d time.Duration, driver bindings.ExecutorDriver, f jumper) suicideWatcher {
+	tracker := &suicideTracker{
+		stops:  t.stops,
+		resets: t.resets,
+		jumps:  t.jumps,
+		timers: t.timers + 1,
+	}
+	jumper := tracker.makeJumper(f)
+	tracker.suicideWatcher = t.suicideWatcher.Next(d, driver, jumper)
+	return tracker
+}
+
+func (t *suicideTracker) makeJumper(_ jumper) jumper {
+	return jumper(func(driver bindings.ExecutorDriver, cancel <-chan struct{}) {
+		glog.Warningln("jumping?!")
+		if t.jumps != nil {
+			atomic.AddUint32(t.jumps, 1)
+		}
+	})
+}
+
+func TestSuicide_zeroTimeout(t *testing.T) {
+	defer glog.Flush()
+
+	k := New(Config{})
+	tracker := &suicideTracker{suicideWatcher: k.suicideWatch}
+	k.suicideWatch = tracker
+
+	ch := k.resetSuicideWatch(nil)
+
+	select {
+	case <-ch:
+	case <-time.After(2 * time.Second):
+		t.Fatalf("timeout waiting for reset of suicide watch")
+	}
+	if tracker.stops != 0 {
+		t.Fatalf("expected no stops since suicideWatchTimeout was never set")
+	}
+	if tracker.resets != 0 {
+		t.Fatalf("expected no resets since suicideWatchTimeout was never set")
+	}
+	if tracker.timers != 0 {
+		t.Fatalf("expected no timers since suicideWatchTimeout was never set")
+	}
+}
+
+func TestSuicide_WithTasks(t *testing.T) {
+	defer glog.Flush()
+
+	k := New(Config{
+		SuicideTimeout: 50 * time.Millisecond,
+	})
+
+	jumps := uint32(0)
+	tracker := &suicideTracker{suicideWatcher: k.suicideWatch, jumps: &jumps}
+	k.suicideWatch = tracker
+
+	k.tasks["foo"] = &kuberTask{} // prevent suicide attempts from succeeding
+
+	// call reset with a nil timer
+	glog.Infoln("resetting suicide watch with 1 task")
+	select {
+	case <-k.resetSuicideWatch(nil):
+		tracker = k.suicideWatch.(*suicideTracker)
+		if tracker.stops != 1 {
+			t.Fatalf("expected suicide attempt to Stop() since there are registered tasks")
+		}
+		if tracker.resets != 0 {
+			t.Fatalf("expected no resets since the watch was stopped, not reset")
+		}
+		if tracker.timers != 0 {
+			t.Fatalf("expected no timers since no countdown should start while tasks are registered")
+		}
+	case <-time.After(1 * time.Second):
+		t.Fatalf("initial suicide watch setup failed")
+	}
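+	// Each resetSuicideWatch call returns a chan that closes once the
+	// asynchronous reset completes, so the selects in this test synchronize
+	// on actual completion instead of sleeping.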
suicide watch setup failed") + } + + delete(k.tasks, "foo") // zero remaining tasks + k.suicideTimeout = 1500 * time.Millisecond + suicideStart := time.Now() + + // reset the suicide watch, which should actually start a timer now + glog.Infoln("resetting suicide watch with 0 tasks") + select { + case <-k.resetSuicideWatch(nil): + tracker = k.suicideWatch.(*suicideTracker) + if tracker.stops != 1 { + t.Fatalf("did not expect suicide attempt to Stop() since there are no registered tasks") + } + if tracker.resets != 1 { + t.Fatalf("expected 1 resets instead of %d", tracker.resets) + } + if tracker.timers != 1 { + t.Fatalf("expected 1 timers instead of %d", tracker.timers) + } + case <-time.After(1 * time.Second): + t.Fatalf("2nd suicide watch setup failed") + } + + k.lock.Lock() + k.tasks["foo"] = &kuberTask{} // prevent suicide attempts from succeeding + k.lock.Unlock() + + // reset the suicide watch, which should stop the existing timer + glog.Infoln("resetting suicide watch with 1 task") + select { + case <-k.resetSuicideWatch(nil): + tracker = k.suicideWatch.(*suicideTracker) + if tracker.stops != 2 { + t.Fatalf("expected 2 stops instead of %d since there are registered tasks", tracker.stops) + } + if tracker.resets != 1 { + t.Fatalf("expected 1 resets instead of %d", tracker.resets) + } + if tracker.timers != 1 { + t.Fatalf("expected 1 timers instead of %d", tracker.timers) + } + case <-time.After(1 * time.Second): + t.Fatalf("3rd suicide watch setup failed") + } + + k.lock.Lock() + delete(k.tasks, "foo") // allow suicide attempts to schedule + k.lock.Unlock() + + // reset the suicide watch, which should reset a stopped timer + glog.Infoln("resetting suicide watch with 0 tasks") + select { + case <-k.resetSuicideWatch(nil): + tracker = k.suicideWatch.(*suicideTracker) + if tracker.stops != 2 { + t.Fatalf("expected 2 stops instead of %d since there are no registered tasks", tracker.stops) + } + if tracker.resets != 2 { + t.Fatalf("expected 2 resets instead of %d", tracker.resets) + } + if tracker.timers != 1 { + t.Fatalf("expected 1 timers instead of %d", tracker.timers) + } + case <-time.After(1 * time.Second): + t.Fatalf("4th suicide watch setup failed") + } + + sinceWatch := time.Since(suicideStart) + time.Sleep(3*time.Second - sinceWatch) // give the first timer to misfire (it shouldn't since Stop() was called) + + if j := atomic.LoadUint32(&jumps); j != 1 { + t.Fatalf("expected 1 jumps instead of %d since stop was called", j) + } else { + glog.Infoln("jumps verified") // glog so we get a timestamp + } +} + +// TestExecutorRegister ensures that the executor thinks it is connected +// after Register is called. 
+func TestExecutorRegister(t *testing.T) {
+	mockDriver := &MockExecutorDriver{}
+	updates := make(chan interface{}, 1024)
+	executor := New(Config{
+		Docker:     dockertools.ConnectToDockerOrDie("fake://"),
+		Updates:    updates,
+		SourceName: "executor_test",
+	})
+
+	executor.Init(mockDriver)
+	executor.Registered(mockDriver, nil, nil, nil)
+
+	initialPodUpdate := kubelet.PodUpdate{
+		Pods:   []*api.Pod{},
+		Op:     kubelet.SET,
+		Source: executor.sourcename,
+	}
+	receivedInitialPodUpdate := false
+	select {
+	case m := <-updates:
+		update, ok := m.(kubelet.PodUpdate)
+		if ok {
+			if reflect.DeepEqual(initialPodUpdate, update) {
+				receivedInitialPodUpdate = true
+			}
+		}
+	case <-time.After(time.Second):
+	}
+	assert.Equal(t, true, receivedInitialPodUpdate,
+		"executor should have sent an initial PodUpdate "+
+			"to the updates chan upon registration")
+
+	assert.Equal(t, true, executor.isConnected(), "executor should be connected")
+	mockDriver.AssertExpectations(t)
+}
+
+// TestExecutorDisconnect ensures that the executor thinks that it is not
+// connected after a call to Disconnected has occurred.
+func TestExecutorDisconnect(t *testing.T) {
+	mockDriver := &MockExecutorDriver{}
+	executor := NewTestKubernetesExecutor()
+
+	executor.Init(mockDriver)
+	executor.Registered(mockDriver, nil, nil, nil)
+	executor.Disconnected(mockDriver)
+
+	assert.Equal(t, false, executor.isConnected(),
+		"executor should not be connected after Disconnected")
+	mockDriver.AssertExpectations(t)
+}
+
+// TestExecutorReregister ensures that the executor thinks it is connected
+// after a connection problem happens, followed by a call to Reregistered.
+func TestExecutorReregister(t *testing.T) {
+	mockDriver := &MockExecutorDriver{}
+	executor := NewTestKubernetesExecutor()
+
+	executor.Init(mockDriver)
+	executor.Registered(mockDriver, nil, nil, nil)
+	executor.Disconnected(mockDriver)
+	executor.Reregistered(mockDriver, nil)
+
+	assert.Equal(t, true, executor.isConnected(), "executor should be connected")
+	mockDriver.AssertExpectations(t)
+}
+
+// TestExecutorLaunchAndKillTask ensures that the executor is able to launch
+// and kill tasks while properly bookkeeping its tasks.
+func TestExecutorLaunchAndKillTask(t *testing.T) {
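+	// Flow under test: LaunchTask decodes the pod from TaskInfo.Data, POSTs
+	// a binding to the fake apiserver below, and reports TASK_STARTING and
+	// then TASK_RUNNING once the stubbed PodStatusFunc reports a running pod.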
+	// create a fake pod watch; we use its list below to seed the fake apiserver
+	podListWatch := NewMockPodsListWatch(api.PodList{})
+
+	// create fake apiserver
+	testApiServer := NewTestServer(t, api.NamespaceDefault, &podListWatch.list)
+	defer testApiServer.server.Close()
+
+	mockDriver := &MockExecutorDriver{}
+	updates := make(chan interface{}, 1024)
+	config := Config{
+		Docker:  dockertools.ConnectToDockerOrDie("fake://"),
+		Updates: updates,
+		APIClient: client.NewOrDie(&client.Config{
+			Host:    testApiServer.server.URL,
+			Version: testapi.Version(),
+		}),
+		Kubelet: &kubelet.Kubelet{},
+		PodStatusFunc: func(kl *kubelet.Kubelet, pod *api.Pod) (*api.PodStatus, error) {
+			return &api.PodStatus{
+				ContainerStatuses: []api.ContainerStatus{
+					{
+						Name: "foo",
+						State: api.ContainerState{
+							Running: &api.ContainerStateRunning{},
+						},
+					},
+				},
+				Phase: api.PodRunning,
+			}, nil
+		},
+	}
+	executor := New(config)
+
+	executor.Init(mockDriver)
+	executor.Registered(mockDriver, nil, nil, nil)
+
+	select {
+	case <-updates:
+	case <-time.After(time.Second):
+		t.Fatalf("Executor should send an initial update on Registration")
+	}
+
+	pod := NewTestPod(1)
+	podTask, err := podtask.New(api.NewDefaultContext(), "",
+		*pod, &mesosproto.ExecutorInfo{})
+	assert.Equal(t, nil, err, "must be able to create a task from a pod")
+
+	taskInfo := podTask.BuildTaskInfo()
+	data, err := testapi.Codec().Encode(pod)
+	assert.Equal(t, nil, err, "must be able to encode a pod's spec data")
+	taskInfo.Data = data
+	var statusUpdateCalls sync.WaitGroup
+	statusUpdateDone := func(_ mock.Arguments) { statusUpdateCalls.Done() }
+
+	statusUpdateCalls.Add(1)
+	mockDriver.On(
+		"SendStatusUpdate",
+		mesosproto.TaskState_TASK_STARTING,
+	).Return(mesosproto.Status_DRIVER_RUNNING, nil).Run(statusUpdateDone).Once()
+
+	statusUpdateCalls.Add(1)
+	mockDriver.On(
+		"SendStatusUpdate",
+		mesosproto.TaskState_TASK_RUNNING,
+	).Return(mesosproto.Status_DRIVER_RUNNING, nil).Run(statusUpdateDone).Once()
+
+	executor.LaunchTask(mockDriver, taskInfo)
+
+	assertext.EventuallyTrue(t, 5*time.Second, func() bool {
+		executor.lock.Lock()
+		defer executor.lock.Unlock()
+		return len(executor.tasks) == 1 && len(executor.pods) == 1
+	}, "executor must be able to create a task and a pod")
+
+	gotPodUpdate := false
+	select {
+	case m := <-updates:
+		update, ok := m.(kubelet.PodUpdate)
+		if ok && len(update.Pods) == 1 {
+			gotPodUpdate = true
+		}
+	case <-time.After(time.Second):
+	}
+	assert.Equal(t, true, gotPodUpdate,
+		"the executor should send an update about a new pod to "+
+			"the updates chan when creating a new one.")
+
+	// Allow some time for asynchronous requests to the driver.
+	finished := kmruntime.After(statusUpdateCalls.Wait)
+	select {
+	case <-finished:
+	case <-time.After(5 * time.Second):
+		t.Fatalf("timed out waiting for status update calls to finish")
+	}
+
+	statusUpdateCalls.Add(1)
+	mockDriver.On(
+		"SendStatusUpdate",
+		mesosproto.TaskState_TASK_KILLED,
+	).Return(mesosproto.Status_DRIVER_RUNNING, nil).Run(statusUpdateDone).Once()
+
+	executor.KillTask(mockDriver, taskInfo.TaskId)
+
+	assertext.EventuallyTrue(t, 5*time.Second, func() bool {
+		executor.lock.Lock()
+		defer executor.lock.Unlock()
+		return len(executor.tasks) == 0 && len(executor.pods) == 0
+	}, "executor must be able to kill a created task and pod")
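+	// kmruntime.After is assumed (contrib/mesos/pkg/runtime) to run the given
+	// func on a goroutine and return a chan that closes when it returns;
+	// selecting on it bounds statusUpdateCalls.Wait() with a timeout.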
+
+	// Allow some time for asynchronous requests to the driver.
+	finished = kmruntime.After(statusUpdateCalls.Wait)
+	select {
+	case <-finished:
+	case <-time.After(5 * time.Second):
+		t.Fatalf("timed out waiting for status update calls to finish")
+	}
+	mockDriver.AssertExpectations(t)
+}
+
+// TestExecutorFrameworkMessage ensures that the executor is able to
+// handle messages from the framework, specifically about lost tasks
+// and Kamikaze. When a task is lost, the executor needs to clean up
+// its state. When a Kamikaze message is received, the executor should
+// attempt suicide.
+func TestExecutorFrameworkMessage(t *testing.T) {
+	mockDriver := &MockExecutorDriver{}
+	kubeletFinished := make(chan struct{})
+	config := Config{
+		Docker:  dockertools.ConnectToDockerOrDie("fake://"),
+		Updates: make(chan interface{}, 1024),
+		APIClient: client.NewOrDie(&client.Config{
+			Host:    "fakehost",
+			Version: testapi.Version(),
+		}),
+		ShutdownAlert: func() {
+			close(kubeletFinished)
+		},
+		KubeletFinished: kubeletFinished,
+	}
+	executor := New(config)
+
+	executor.Init(mockDriver)
+	executor.Registered(mockDriver, nil, nil, nil)
+
+	executor.FrameworkMessage(mockDriver, "test framework message")
+
+	// set up a pod to then lose
+	pod := NewTestPod(1)
+	podTask, _ := podtask.New(api.NewDefaultContext(), "foo",
+		*pod, &mesosproto.ExecutorInfo{})
+
+	taskInfo := podTask.BuildTaskInfo()
+	data, _ := testapi.Codec().Encode(pod)
+	taskInfo.Data = data
+
+	executor.LaunchTask(mockDriver, taskInfo)
+
+	// send task-lost message for it
+	called := make(chan struct{})
+	mockDriver.On(
+		"SendStatusUpdate",
+		mesosproto.TaskState_TASK_LOST,
+	).Return(mesosproto.Status_DRIVER_RUNNING, nil).Run(func(_ mock.Arguments) { close(called) }).Once()
+
+	executor.FrameworkMessage(mockDriver, "task-lost:foo")
+	assertext.EventuallyTrue(t, 5*time.Second, func() bool {
+		executor.lock.Lock()
+		defer executor.lock.Unlock()
+		return len(executor.tasks) == 0 && len(executor.pods) == 0
+	}, "executor must be able to kill a created task and pod")
+
+	select {
+	case <-called:
+	case <-time.After(5 * time.Second):
+		t.Fatalf("timed out waiting for SendStatusUpdate")
+	}
+
+	mockDriver.On("Stop").Return(mesosproto.Status_DRIVER_STOPPED, nil).Once()
+
+	executor.FrameworkMessage(mockDriver, messages.Kamikaze)
+	assert.Equal(t, true, executor.isDone(),
+		"executor should have shut down after receiving a Kamikaze message")
+
+	mockDriver.AssertExpectations(t)
+}
+
+// NewTestPod creates a pod with a given index, requiring one port.
+func NewTestPod(i int) *api.Pod {
+	name := fmt.Sprintf("pod%d", i)
+	return &api.Pod{
+		TypeMeta: api.TypeMeta{APIVersion: testapi.Version()},
+		ObjectMeta: api.ObjectMeta{
+			Name:      name,
+			Namespace: api.NamespaceDefault,
+			SelfLink:  testapi.SelfLink("pods", name),
+		},
+		Spec: api.PodSpec{
+			Containers: []api.Container{
+				{
+					Ports: []api.ContainerPort{
+						{
+							ContainerPort: 8000 + i,
+							Protocol:      api.ProtocolTCP,
+						},
+					},
+				},
+			},
+		},
+		Status: api.PodStatus{
+			Conditions: []api.PodCondition{
+				{
+					Type:   api.PodReady,
+					Status: api.ConditionTrue,
+				},
+			},
+		},
+	}
+}
+
+// MockPodsListWatch mocks the pods ListWatch that would normally listen on
+// the apiserver pods watch endpoint.
+type MockPodsListWatch struct {
+	ListWatch   cache.ListWatch
+	fakeWatcher *watch.FakeWatcher
+	list        api.PodList
+}
+
+// TestServer is an apiserver mock which partially implements the pods API.
+type TestServer struct {
+	server *httptest.Server
+	Stats  map[string]uint
+	lock   sync.Mutex
+}
+
+func NewTestServer(t *testing.T, namespace string, pods *api.PodList) *TestServer {
+	ts := TestServer{
+		Stats: map[string]uint{},
+	}
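+	// Only the bindings endpoint is stubbed below; answering the executor's
+	// POST of an api.Binding with 200 OK is all launchTask needs to proceed.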
+ mux := http.NewServeMux() + + mux.HandleFunc(testapi.ResourcePath("bindings", namespace, ""), func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + }) + + ts.server = httptest.NewServer(mux) + return &ts +} + +func NewMockPodsListWatch(initialPodList api.PodList) *MockPodsListWatch { + lw := MockPodsListWatch{ + fakeWatcher: watch.NewFake(), + list: initialPodList, + } + lw.ListWatch = cache.ListWatch{ + WatchFunc: func(resourceVersion string) (watch.Interface, error) { + return lw.fakeWatcher, nil + }, + ListFunc: func() (runtime.Object, error) { + return &lw.list, nil + }, + } + return &lw +} + +// TestExecutorShutdown ensures that the executor properly shuts down +// when Shutdown is called. +func TestExecutorShutdown(t *testing.T) { + mockDriver := &MockExecutorDriver{} + kubeletFinished := make(chan struct{}) + var exitCalled int32 = 0 + config := Config{ + Docker: dockertools.ConnectToDockerOrDie("fake://"), + Updates: make(chan interface{}, 1024), + ShutdownAlert: func() { + close(kubeletFinished) + }, + KubeletFinished: kubeletFinished, + ExitFunc: func(_ int) { + atomic.AddInt32(&exitCalled, 1) + }, + } + executor := New(config) + + executor.Init(mockDriver) + executor.Registered(mockDriver, nil, nil, nil) + + mockDriver.On("Stop").Return(mesosproto.Status_DRIVER_STOPPED, nil).Once() + + executor.Shutdown(mockDriver) + + assert.Equal(t, false, executor.isConnected(), + "executor should not be connected after Shutdown") + assert.Equal(t, true, executor.isDone(), + "executor should be in Done state after Shutdown") + + select { + case <-executor.Done(): + default: + t.Fatal("done channel should be closed after shutdown") + } + + assert.Equal(t, true, atomic.LoadInt32(&exitCalled) > 0, + "the executor should call its ExitFunc when it is ready to close down") + + mockDriver.AssertExpectations(t) +} + +func TestExecutorsendFrameworkMessage(t *testing.T) { + mockDriver := &MockExecutorDriver{} + executor := NewTestKubernetesExecutor() + + executor.Init(mockDriver) + executor.Registered(mockDriver, nil, nil, nil) + + called := make(chan struct{}) + mockDriver.On( + "SendFrameworkMessage", + "foo bar baz", + ).Return(mesosproto.Status_DRIVER_RUNNING, nil).Run(func(_ mock.Arguments) { close(called) }).Once() + executor.sendFrameworkMessage(mockDriver, "foo bar baz") + + // guard against data race in mock driver between AssertExpectations and Called + select { + case <-called: // expected + case <-time.After(5 * time.Second): + t.Fatalf("expected call to SendFrameworkMessage") + } + mockDriver.AssertExpectations(t) +} diff --git a/contrib/mesos/pkg/executor/messages/doc.go b/contrib/mesos/pkg/executor/messages/doc.go new file mode 100644 index 00000000000..ac09f189b56 --- /dev/null +++ b/contrib/mesos/pkg/executor/messages/doc.go @@ -0,0 +1,18 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package messages exposes executor event/message names as constants. 
+package messages diff --git a/contrib/mesos/pkg/executor/messages/messages.go b/contrib/mesos/pkg/executor/messages/messages.go new file mode 100644 index 00000000000..bf3dd1a9888 --- /dev/null +++ b/contrib/mesos/pkg/executor/messages/messages.go @@ -0,0 +1,32 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package messages + +// messages that ship with TaskStatus objects + +const ( + ContainersDisappeared = "containers-disappeared" + CreateBindingFailure = "create-binding-failure" + CreateBindingSuccess = "create-binding-success" + ExecutorUnregistered = "executor-unregistered" + ExecutorShutdown = "executor-shutdown" + LaunchTaskFailed = "launch-task-failed" + TaskKilled = "task-killed" + UnmarshalTaskDataFailure = "unmarshal-task-data-failure" + TaskLostAck = "task-lost-ack" // executor acknowledgement of forwarded TASK_LOST framework message + Kamikaze = "kamikaze" +) diff --git a/contrib/mesos/pkg/executor/mock_test.go b/contrib/mesos/pkg/executor/mock_test.go new file mode 100644 index 00000000000..4b060024a52 --- /dev/null +++ b/contrib/mesos/pkg/executor/mock_test.go @@ -0,0 +1,81 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package executor + +import ( + "testing" + + "github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/dockertools" + "github.com/mesos/mesos-go/mesosproto" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" +) + +type MockExecutorDriver struct { + mock.Mock +} + +func (m *MockExecutorDriver) Start() (mesosproto.Status, error) { + args := m.Called() + return args.Get(0).(mesosproto.Status), args.Error(1) +} + +func (m *MockExecutorDriver) Stop() (mesosproto.Status, error) { + args := m.Called() + return args.Get(0).(mesosproto.Status), args.Error(1) +} + +func (m *MockExecutorDriver) Abort() (mesosproto.Status, error) { + args := m.Called() + return args.Get(0).(mesosproto.Status), args.Error(1) +} + +func (m *MockExecutorDriver) Join() (mesosproto.Status, error) { + args := m.Called() + return args.Get(0).(mesosproto.Status), args.Error(1) +} + +func (m *MockExecutorDriver) Run() (mesosproto.Status, error) { + args := m.Called() + return args.Get(0).(mesosproto.Status), args.Error(1) +} + +func (m *MockExecutorDriver) SendStatusUpdate(taskStatus *mesosproto.TaskStatus) (mesosproto.Status, error) { + args := m.Called(*taskStatus.State) + return args.Get(0).(mesosproto.Status), args.Error(1) +} + +func (m *MockExecutorDriver) SendFrameworkMessage(msg string) (mesosproto.Status, error) { + args := m.Called(msg) + return args.Get(0).(mesosproto.Status), args.Error(1) +} + +func NewTestKubernetesExecutor() *KubernetesExecutor { + return New(Config{ + Docker: dockertools.ConnectToDockerOrDie("fake://"), + Updates: make(chan interface{}, 1024), + }) +} + +func TestExecutorNew(t *testing.T) { + mockDriver := &MockExecutorDriver{} + executor := NewTestKubernetesExecutor() + executor.Init(mockDriver) + + assert.Equal(t, executor.isDone(), false, "executor should not be in Done state on initialization") + assert.Equal(t, executor.isConnected(), false, "executor should not be connected on initialization") +} diff --git a/contrib/mesos/pkg/executor/service/doc.go b/contrib/mesos/pkg/executor/service/doc.go new file mode 100644 index 00000000000..f915ee4239f --- /dev/null +++ b/contrib/mesos/pkg/executor/service/doc.go @@ -0,0 +1,18 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package service contains the cmd/k8sm-executor glue code. +package service diff --git a/contrib/mesos/pkg/executor/service/service.go b/contrib/mesos/pkg/executor/service/service.go new file mode 100644 index 00000000000..5de0da0db65 --- /dev/null +++ b/contrib/mesos/pkg/executor/service/service.go @@ -0,0 +1,600 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package service
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"math/rand"
+	"net"
+	"net/http"
+	"os"
+	"os/exec"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/GoogleCloudPlatform/kubernetes/cmd/kubelet/app"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/config"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/hyperkube"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/redirfd"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/credentialprovider"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/healthz"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/cadvisor"
+	kconfig "github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/config"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/dockertools"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/util/mount"
+	log "github.com/golang/glog"
+	"github.com/kardianos/osext"
+	bindings "github.com/mesos/mesos-go/executor"
+
+	"github.com/spf13/pflag"
+)
+
+const (
+	// if we don't use this source then the kubelet will do funny things with mirror pods
+ // @see ConfigSourceAnnotationKey + MESOS_CFG_SOURCE = kubelet.ApiserverSource +) + +type KubeletExecutorServer struct { + *app.KubeletServer + RunProxy bool + ProxyLogV int + ProxyExec string + ProxyLogfile string + ProxyBindall bool + SuicideTimeout time.Duration + ShutdownFD int + ShutdownFIFO string +} + +func NewKubeletExecutorServer() *KubeletExecutorServer { + k := &KubeletExecutorServer{ + KubeletServer: app.NewKubeletServer(), + RunProxy: true, + ProxyExec: "./kube-proxy", + ProxyLogfile: "./proxy-log", + SuicideTimeout: config.DefaultSuicideTimeout, + } + if pwd, err := os.Getwd(); err != nil { + log.Warningf("failed to determine current directory: %v", err) + } else { + k.RootDirectory = pwd // mesos sandbox dir + } + k.Address = util.IP(net.ParseIP(defaultBindingAddress())) + k.ShutdownFD = -1 // indicates unspecified FD + return k +} + +func NewHyperKubeletExecutorServer() *KubeletExecutorServer { + s := NewKubeletExecutorServer() + + // cache this for later use + binary, err := osext.Executable() + if err != nil { + log.Fatalf("failed to determine currently running executable: %v", err) + } + + s.ProxyExec = binary + return s +} + +func (s *KubeletExecutorServer) addCoreFlags(fs *pflag.FlagSet) { + s.KubeletServer.AddFlags(fs) + fs.BoolVar(&s.RunProxy, "run-proxy", s.RunProxy, "Maintain a running kube-proxy instance as a child proc of this kubelet-executor.") + fs.IntVar(&s.ProxyLogV, "proxy-logv", s.ProxyLogV, "Log verbosity of the child kube-proxy.") + fs.StringVar(&s.ProxyLogfile, "proxy-logfile", s.ProxyLogfile, "Path to the kube-proxy log file.") + fs.BoolVar(&s.ProxyBindall, "proxy-bindall", s.ProxyBindall, "When true will cause kube-proxy to bind to 0.0.0.0.") + fs.DurationVar(&s.SuicideTimeout, "suicide-timeout", s.SuicideTimeout, "Self-terminate after this period of inactivity. Zero disables suicide watch.") + fs.IntVar(&s.ShutdownFD, "shutdown-fd", s.ShutdownFD, "File descriptor used to signal shutdown to external watchers, requires shutdown-fifo flag") + fs.StringVar(&s.ShutdownFIFO, "shutdown-fifo", s.ShutdownFIFO, "FIFO used to signal shutdown to external watchers, requires shutdown-fd flag") +} + +func (s *KubeletExecutorServer) AddStandaloneFlags(fs *pflag.FlagSet) { + s.addCoreFlags(fs) + fs.StringVar(&s.ProxyExec, "proxy-exec", s.ProxyExec, "Path to the kube-proxy executable.") +} + +func (s *KubeletExecutorServer) AddHyperkubeFlags(fs *pflag.FlagSet) { + s.addCoreFlags(fs) +} + +// returns a Closer that should be closed to signal impending shutdown, but only if ShutdownFD +// and ShutdownFIFO were specified. if they are specified, then this func blocks until there's +// a reader on the FIFO stream. +func (s *KubeletExecutorServer) syncExternalShutdownWatcher() (io.Closer, error) { + if s.ShutdownFD == -1 || s.ShutdownFIFO == "" { + return nil, nil + } + // redirfd -w n fifo ... # (blocks until the fifo is read) + log.Infof("blocked, waiting for shutdown reader for FD %d FIFO at %s", s.ShutdownFD, s.ShutdownFIFO) + return redirfd.Write.Redirect(true, false, redirfd.FileDescriptor(s.ShutdownFD), s.ShutdownFIFO) +} + +// Run runs the specified KubeletExecutorServer. 
+func (s *KubeletExecutorServer) Run(hks hyperkube.Interface, _ []string) error { + rand.Seed(time.Now().UTC().UnixNano()) + + if err := util.ApplyOomScoreAdj(0, s.OOMScoreAdj); err != nil { + log.Info(err) + } + + var apiclient *client.Client + clientConfig, err := s.CreateAPIServerClientConfig() + if err == nil { + apiclient, err = client.New(clientConfig) + } + if err != nil { + // required for k8sm since we need to send api.Binding information + // back to the apiserver + log.Fatalf("No API client: %v", err) + } + + log.Infof("Using root directory: %v", s.RootDirectory) + credentialprovider.SetPreferredDockercfgPath(s.RootDirectory) + + shutdownCloser, err := s.syncExternalShutdownWatcher() + if err != nil { + return err + } + + cadvisorInterface, err := cadvisor.New(s.CadvisorPort) + if err != nil { + return err + } + + imageGCPolicy := kubelet.ImageGCPolicy{ + HighThresholdPercent: s.ImageGCHighThresholdPercent, + LowThresholdPercent: s.ImageGCLowThresholdPercent, + } + + diskSpacePolicy := kubelet.DiskSpacePolicy{ + DockerFreeDiskMB: s.LowDiskSpaceThresholdMB, + RootFreeDiskMB: s.LowDiskSpaceThresholdMB, + } + + //TODO(jdef) intentionally NOT initializing a cloud provider here since: + //(a) the kubelet doesn't actually use it + //(b) we don't need to create N-kubelet connections to zookeeper for no good reason + //cloud := cloudprovider.InitCloudProvider(s.CloudProvider, s.CloudConfigFile) + //log.Infof("Successfully initialized cloud provider: %q from the config file: %q\n", s.CloudProvider, s.CloudConfigFile) + + hostNetworkSources, err := kubelet.GetValidatedSources(strings.Split(s.HostNetworkSources, ",")) + if err != nil { + return err + } + + tlsOptions, err := s.InitializeTLS() + if err != nil { + return err + } + mounter := mount.New() + if s.Containerized { + log.V(2).Info("Running kubelet in containerized mode (experimental)") + mounter = &mount.NsenterMounter{} + } + + var dockerExecHandler dockertools.ExecHandler + switch s.DockerExecHandlerName { + case "native": + dockerExecHandler = &dockertools.NativeExecHandler{} + case "nsenter": + dockerExecHandler = &dockertools.NsenterExecHandler{} + default: + log.Warningf("Unknown Docker exec handler %q; defaulting to native", s.DockerExecHandlerName) + dockerExecHandler = &dockertools.NativeExecHandler{} + } + + kcfg := app.KubeletConfig{ + Address: s.Address, + AllowPrivileged: s.AllowPrivileged, + HostNetworkSources: hostNetworkSources, + HostnameOverride: s.HostnameOverride, + RootDirectory: s.RootDirectory, + // ConfigFile: "" + // ManifestURL: "" + // FileCheckFrequency + // HTTPCheckFrequency + PodInfraContainerImage: s.PodInfraContainerImage, + SyncFrequency: s.SyncFrequency, + RegistryPullQPS: s.RegistryPullQPS, + RegistryBurst: s.RegistryBurst, + MinimumGCAge: s.MinimumGCAge, + MaxPerPodContainerCount: s.MaxPerPodContainerCount, + MaxContainerCount: s.MaxContainerCount, + RegisterNode: s.RegisterNode, + ClusterDomain: s.ClusterDomain, + ClusterDNS: s.ClusterDNS, + Runonce: s.RunOnce, + Port: s.Port, + ReadOnlyPort: s.ReadOnlyPort, + CadvisorInterface: cadvisorInterface, + EnableServer: s.EnableServer, + EnableDebuggingHandlers: s.EnableDebuggingHandlers, + DockerClient: dockertools.ConnectToDockerOrDie(s.DockerEndpoint), + KubeClient: apiclient, + MasterServiceNamespace: s.MasterServiceNamespace, + VolumePlugins: app.ProbeVolumePlugins(), + NetworkPlugins: app.ProbeNetworkPlugins(), + NetworkPluginName: s.NetworkPluginName, + StreamingConnectionIdleTimeout: s.StreamingConnectionIdleTimeout, + TLSOptions: tlsOptions, 
+ ImageGCPolicy: imageGCPolicy, + DiskSpacePolicy: diskSpacePolicy, + Cloud: nil, // TODO(jdef) Cloud, specifying null here because we don't want all kubelets polling mesos-master; need to account for this in the cloudprovider impl + NodeStatusUpdateFrequency: s.NodeStatusUpdateFrequency, + ResourceContainer: s.ResourceContainer, + CgroupRoot: s.CgroupRoot, + ContainerRuntime: s.ContainerRuntime, + Mounter: mounter, + DockerDaemonContainer: s.DockerDaemonContainer, + SystemContainer: s.SystemContainer, + ConfigureCBR0: s.ConfigureCBR0, + MaxPods: s.MaxPods, + DockerExecHandler: dockerExecHandler, + } + + err = app.RunKubelet(&kcfg, app.KubeletBuilder(func(kc *app.KubeletConfig) (app.KubeletBootstrap, *kconfig.PodConfig, error) { + return s.createAndInitKubelet(kc, hks, clientConfig, shutdownCloser) + })) + if err != nil { + return err + } + + if s.HealthzPort > 0 { + healthz.DefaultHealthz() + go util.Forever(func() { + err := http.ListenAndServe(net.JoinHostPort(s.HealthzBindAddress.String(), strconv.Itoa(s.HealthzPort)), nil) + if err != nil { + log.Errorf("Starting health server failed: %v", err) + } + }, 5*time.Second) + } + + // block until executor is shut down or commits shutdown + select {} +} + +func defaultBindingAddress() string { + libProcessIP := os.Getenv("LIBPROCESS_IP") + if libProcessIP == "" { + return "0.0.0.0" + } else { + return libProcessIP + } +} + +func (ks *KubeletExecutorServer) createAndInitKubelet( + kc *app.KubeletConfig, + hks hyperkube.Interface, + clientConfig *client.Config, + shutdownCloser io.Closer, +) (app.KubeletBootstrap, *kconfig.PodConfig, error) { + + // TODO(k8s): block until all sources have delivered at least one update to the channel, or break the sync loop + // up into "per source" synchronizations + // TODO(k8s): KubeletConfig.KubeClient should be a client interface, but client interface misses certain methods + // used by kubelet. Since NewMainKubelet expects a client interface, we need to make sure we are not passing + // a nil pointer to it when what we really want is a nil interface. 
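+	// (A typed nil such as (*client.Client)(nil) stored in an interface
+	// value compares non-nil, so nil checks inside the kubelet would
+	// wrongly pass; the explicit indirection below yields a true nil
+	// interface when no client is configured.)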
+ var kubeClient client.Interface + if kc.KubeClient == nil { + kubeClient = nil + } else { + kubeClient = kc.KubeClient + } + + gcPolicy := kubelet.ContainerGCPolicy{ + MinAge: kc.MinimumGCAge, + MaxPerPodContainer: kc.MaxPerPodContainerCount, + MaxContainers: kc.MaxContainerCount, + } + + pc := kconfig.NewPodConfig(kconfig.PodConfigNotificationSnapshotAndUpdates, kc.Recorder) + updates := pc.Channel(MESOS_CFG_SOURCE) + + klet, err := kubelet.NewMainKubelet( + kc.Hostname, + kc.DockerClient, + kubeClient, + kc.RootDirectory, + kc.PodInfraContainerImage, + kc.SyncFrequency, + float32(kc.RegistryPullQPS), + kc.RegistryBurst, + gcPolicy, + pc.SeenAllSources, + kc.RegisterNode, + kc.ClusterDomain, + net.IP(kc.ClusterDNS), + kc.MasterServiceNamespace, + kc.VolumePlugins, + kc.NetworkPlugins, + kc.NetworkPluginName, + kc.StreamingConnectionIdleTimeout, + kc.Recorder, + kc.CadvisorInterface, + kc.ImageGCPolicy, + kc.DiskSpacePolicy, + kc.Cloud, + kc.NodeStatusUpdateFrequency, + kc.ResourceContainer, + kc.OSInterface, + kc.CgroupRoot, + kc.ContainerRuntime, + kc.Mounter, + kc.DockerDaemonContainer, + kc.SystemContainer, + kc.ConfigureCBR0, + kc.MaxPods, + kc.DockerExecHandler, + ) + if err != nil { + return nil, nil, err + } + + //TODO(jdef) either configure Watch here with something useful, or else + // get rid of it from executor.Config + kubeletFinished := make(chan struct{}) + exec := executor.New(executor.Config{ + Kubelet: klet, + Updates: updates, + SourceName: MESOS_CFG_SOURCE, + APIClient: kc.KubeClient, + Docker: kc.DockerClient, + SuicideTimeout: ks.SuicideTimeout, + KubeletFinished: kubeletFinished, + ShutdownAlert: func() { + if shutdownCloser != nil { + if e := shutdownCloser.Close(); e != nil { + log.Warningf("failed to signal shutdown to external watcher: %v", e) + } + } + }, + ExitFunc: os.Exit, + PodStatusFunc: func(kl *kubelet.Kubelet, pod *api.Pod) (*api.PodStatus, error) { + return kl.GetRuntime().GetPodStatus(pod) + }, + }) + + k := &kubeletExecutor{ + Kubelet: klet, + runProxy: ks.RunProxy, + proxyLogV: ks.ProxyLogV, + proxyExec: ks.ProxyExec, + proxyLogfile: ks.ProxyLogfile, + proxyBindall: ks.ProxyBindall, + address: ks.Address, + dockerClient: kc.DockerClient, + hks: hks, + kubeletFinished: kubeletFinished, + executorDone: exec.Done(), + clientConfig: clientConfig, + } + + dconfig := bindings.DriverConfig{ + Executor: exec, + HostnameOverride: ks.HostnameOverride, + BindingAddress: net.IP(ks.Address), + } + if driver, err := bindings.NewMesosExecutorDriver(dconfig); err != nil { + log.Fatalf("failed to create executor driver: %v", err) + } else { + k.driver = driver + } + + log.V(2).Infof("Initialize executor driver...") + + k.BirthCry() + exec.Init(k.driver) + + k.StartGarbageCollection() + + return k, pc, nil +} + +// kubelet decorator +type kubeletExecutor struct { + *kubelet.Kubelet + initialize sync.Once + driver bindings.ExecutorDriver + runProxy bool + proxyLogV int + proxyExec string + proxyLogfile string + proxyBindall bool + address util.IP + dockerClient dockertools.DockerInterface + hks hyperkube.Interface + kubeletFinished chan struct{} // closed once kubelet.Run() returns + executorDone <-chan struct{} // from KubeletExecutor.Done() + clientConfig *client.Config +} + +func (kl *kubeletExecutor) ListenAndServe(address net.IP, port uint, tlsOptions *kubelet.TLSOptions, enableDebuggingHandlers bool) { + // this func could be called many times, depending how often the HTTP server crashes, + // so only execute certain initialization procs once + 
kl.initialize.Do(func() {
+		// kl.initialize is a sync.Once: the proxy service and executor driver
+		// are started at most once, even if the HTTP server crashes and
+		// ListenAndServe is entered again.
+		if kl.runProxy {
+			go runtime.Until(kl.runProxyService, 5*time.Second, kl.executorDone)
+		}
+		go func() {
+			if _, err := kl.driver.Run(); err != nil {
+				log.Fatalf("executor driver failed: %v", err)
+			}
+			log.Info("executor Run completed")
+		}()
+	})
+	log.Infof("Starting kubelet server...")
+	kubelet.ListenAndServeKubeletServer(kl, address, port, tlsOptions, enableDebuggingHandlers)
+}
+
+// this function blocks as long as the proxy service is running; intended to be
+// executed asynchronously.
+func (kl *kubeletExecutor) runProxyService() {
+
+	log.Infof("Starting proxy process...")
+
+	const KM_PROXY = "proxy" //TODO(jdef) constant should be shared with km package
+	args := []string{}
+
+	if kl.hks.FindServer(KM_PROXY) {
+		args = append(args, KM_PROXY)
+		log.V(1).Infof("attempting to use km proxy service")
+	} else if _, err := os.Stat(kl.proxyExec); os.IsNotExist(err) {
+		log.Errorf("failed to locate proxy executable at '%v' and km not present: %v", kl.proxyExec, err)
+		return
+	}
+
+	bindAddress := "0.0.0.0"
+	if !kl.proxyBindall {
+		bindAddress = kl.address.String()
+	}
+	args = append(args,
+		fmt.Sprintf("--bind-address=%s", bindAddress),
+		fmt.Sprintf("--v=%d", kl.proxyLogV),
+		"--logtostderr=true",
+	)
+
+	// add client.Config args here. proxy still calls client.BindClientConfigFlags
+	appendStringArg := func(name, value string) {
+		if value != "" {
+			args = append(args, fmt.Sprintf("--%s=%s", name, value))
+		}
+	}
+	appendStringArg("master", kl.clientConfig.Host)
+	/* TODO(jdef) move these flags to a config file pointed to by --kubeconfig
+	appendStringArg("api-version", kl.clientConfig.Version)
+	appendStringArg("client-certificate", kl.clientConfig.CertFile)
+	appendStringArg("client-key", kl.clientConfig.KeyFile)
+	appendStringArg("certificate-authority", kl.clientConfig.CAFile)
+	args = append(args, fmt.Sprintf("--insecure-skip-tls-verify=%t", kl.clientConfig.Insecure))
+	*/
+
+	log.Infof("Spawning process executable %s with args '%+v'", kl.proxyExec, args)
+
+	cmd := exec.Command(kl.proxyExec, args...)
+	if _, err := cmd.StdoutPipe(); err != nil {
+		log.Fatal(err)
+	}
+
+	proxylogs, err := cmd.StderrPipe()
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	//TODO(jdef) append instead of truncate? what if the disk is full?
+	logfile, err := os.Create(kl.proxyLogfile)
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer logfile.Close()
+
+	ch := make(chan struct{})
+	go func() {
+		defer func() {
+			select {
+			case <-ch:
+				log.Infof("killing proxy process...")
+				if err = cmd.Process.Kill(); err != nil {
+					log.Errorf("failed to kill proxy process: %v", err)
+				}
+			default:
+			}
+		}()
+
+		writer := bufio.NewWriter(logfile)
+		defer writer.Flush()
+
+		<-ch
+		written, err := io.Copy(writer, proxylogs)
+		if err != nil {
+			log.Errorf("error writing data to proxy log: %v", err)
+		}
+
+		log.Infof("wrote %d bytes to proxy log", written)
+	}()
+
+	// if the proxy fails to start then we exit the executor, otherwise
+	// wait for the proxy process to end (and release resources after).
+	if err := cmd.Start(); err != nil {
+		log.Fatal(err)
+	}
+	close(ch)
+	if err := cmd.Wait(); err != nil {
+		log.Error(err)
+	}
+}
+
+// runs the main kubelet loop, closing the kubeletFinished chan when the loop exits.
+// never returns.
+func (kl *kubeletExecutor) Run(updates <-chan kubelet.PodUpdate) {
+	defer func() {
+		close(kl.kubeletFinished)
+		util.HandleCrash()
+		log.Infoln("kubelet run terminated") //TODO(jdef) turn down verbosity
+		// important: never return!
+		select {}
+	}()
+
+	// push updates through a closable pipe. when the executor indicates shutdown
+	// via Done() we want to stop the Kubelet from processing updates.
+	pipe := make(chan kubelet.PodUpdate)
+	go func() {
+		// closing pipe will cause our patched kubelet's syncLoop() to exit
+		defer close(pipe)
+	pipeLoop:
+		for {
+			select {
+			case <-kl.executorDone:
+				break pipeLoop
+			default:
+				select {
+				case u := <-updates:
+					select {
+					case pipe <- u: // noop
+					case <-kl.executorDone:
+						break pipeLoop
+					}
+				case <-kl.executorDone:
+					break pipeLoop
+				}
+			}
+		}
+	}()
+
+	// we expect that Run() will complete after the pipe is closed and the
+	// kubelet's syncLoop() has finished processing its backlog, which hopefully
+	// will not take very long. Peeking into the future (current k8s master) it
+	// seems that the backlog has grown from 1 to 50 -- this may negatively impact
+	// us going forward, time will tell.
+	util.Until(func() { kl.Kubelet.Run(pipe) }, 0, kl.executorDone)
+
+	//TODO(jdef) revisit this if/when executor failover lands
+	err := kl.SyncPods([]*api.Pod{}, nil, nil, time.Now())
+	if err != nil {
+		log.Errorf("failed to cleanly remove all pods and associated state: %v", err)
+	}
+}
diff --git a/contrib/mesos/pkg/hyperkube/doc.go b/contrib/mesos/pkg/hyperkube/doc.go
new file mode 100644
index 00000000000..c20e34402b3
--- /dev/null
+++ b/contrib/mesos/pkg/hyperkube/doc.go
@@ -0,0 +1,21 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package hyperkube facilitates the combination of multiple
+// kubernetes-mesos components into a single binary form, providing a
+// simple mechanism for intra-component discovery as per the original
+// Kubernetes hyperkube package.
+package hyperkube
diff --git a/contrib/mesos/pkg/hyperkube/types.go b/contrib/mesos/pkg/hyperkube/types.go
new file mode 100644
index 00000000000..e255f893f4f
--- /dev/null
+++ b/contrib/mesos/pkg/hyperkube/types.go
@@ -0,0 +1,54 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package hyperkube
+
+import (
+	"github.com/spf13/pflag"
+)
+
+var (
+	nilKube = &nilKubeType{}
+)
+
+type Interface interface {
+	// FindServer reports whether a specific server named name is known.
+	FindServer(name string) bool
+
+	// The executable name, used for help and soft-link invocation
+	Name() string
+
+	// Flags returns a flagset for "global" flags.
+ Flags() *pflag.FlagSet +} + +type nilKubeType struct{} + +func (n *nilKubeType) FindServer(_ string) bool { + return false +} + +func (n *nilKubeType) Name() string { + return "" +} + +func (n *nilKubeType) Flags() *pflag.FlagSet { + return nil +} + +func Nil() Interface { + return nilKube +} diff --git a/contrib/mesos/pkg/offers/doc.go b/contrib/mesos/pkg/offers/doc.go new file mode 100644 index 00000000000..03a76f3a3cb --- /dev/null +++ b/contrib/mesos/pkg/offers/doc.go @@ -0,0 +1,18 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package offers contains code that manages Mesos offers. +package offers diff --git a/contrib/mesos/pkg/offers/metrics/doc.go b/contrib/mesos/pkg/offers/metrics/doc.go new file mode 100644 index 00000000000..9660dff774c --- /dev/null +++ b/contrib/mesos/pkg/offers/metrics/doc.go @@ -0,0 +1,19 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package metrics defines and exposes instrumentation metrics related to +// Mesos offers. +package metrics diff --git a/contrib/mesos/pkg/offers/metrics/metrics.go b/contrib/mesos/pkg/offers/metrics/metrics.go new file mode 100644 index 00000000000..dbebf2f42ea --- /dev/null +++ b/contrib/mesos/pkg/offers/metrics/metrics.go @@ -0,0 +1,89 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package metrics + +import ( + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" +) + +const ( + offerSubsystem = "mesos_offers" +) + +type OfferDeclinedReason string + +const ( + OfferExpired = OfferDeclinedReason("expired") + OfferRescinded = OfferDeclinedReason("rescinded") + OfferCompat = OfferDeclinedReason("compat") +) + +var ( + OffersReceived = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: offerSubsystem, + Name: "received", + Help: "Counter of offers received from Mesos broken out by slave host.", + }, + []string{"hostname"}, + ) + + OffersDeclined = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: offerSubsystem, + Name: "declined", + Help: "Counter of offers declined by the framework broken out by slave host.", + }, + []string{"hostname", "reason"}, + ) + + OffersAcquired = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: offerSubsystem, + Name: "acquired", + Help: "Counter of offers acquired for task launch broken out by slave host.", + }, + []string{"hostname"}, + ) + + OffersReleased = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: offerSubsystem, + Name: "released", + Help: "Counter of previously-acquired offers later released, broken out by slave host.", + }, + []string{"hostname"}, + ) +) + +var registerMetrics sync.Once + +func Register() { + registerMetrics.Do(func() { + prometheus.MustRegister(OffersReceived) + prometheus.MustRegister(OffersDeclined) + prometheus.MustRegister(OffersAcquired) + prometheus.MustRegister(OffersReleased) + }) +} + +func InMicroseconds(d time.Duration) float64 { + return float64(d.Nanoseconds() / time.Microsecond.Nanoseconds()) +} diff --git a/contrib/mesos/pkg/offers/offers.go b/contrib/mesos/pkg/offers/offers.go new file mode 100644 index 00000000000..0df507d593d --- /dev/null +++ b/contrib/mesos/pkg/offers/offers.go @@ -0,0 +1,570 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package offers
+
+import (
+	"fmt"
+	"reflect"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers/metrics"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/queue"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
+	log "github.com/golang/glog"
+	mesos "github.com/mesos/mesos-go/mesosproto"
+)
+
+const (
+	offerListenerMaxAge      = 12              // max number of times we'll attempt to fit an offer to a listener before requiring it to re-register itself
+	offerIdCacheTTL          = 1 * time.Second // determines expiration of cached offer ids, used in listener notification
+	deferredDeclineTtlFactor = 2               // this factor, multiplied by the offer ttl, determines how long to wait before attempting to decline previously claimed offers that were subsequently deleted, then released. see offerStorage.Delete
+	notifyListenersDelay     = 0               // delay between offer listener notification attempts
+)
+
+type Filter func(*mesos.Offer) bool
+
+type Registry interface {
+	// Initialize the instance, spawning necessary housekeeping go routines.
+	Init(<-chan struct{})
+
+	// Add offers to this registry, rejecting those that are deemed incompatible.
+	Add([]*mesos.Offer)
+
+	// Listen for arriving offers that are acceptable to the filter, sending
+	// a signal on (by closing) the returned channel. A listener will only
+	// ever be notified once, if at all.
+	Listen(id string, f Filter) <-chan struct{}
+
+	// invoked when offers are rescinded or expired
+	Delete(string, metrics.OfferDeclinedReason)
+
+	// when true, returns the offer that's registered for the given ID
+	Get(offerId string) (Perishable, bool)
+
+	// iterate through non-expired offers in this registry
+	Walk(Walker) error
+
+	// invalidate one or all (when offerId="") offers; offers are not declined,
+	// but are simply flagged as expired in the offer history
+	Invalidate(offerId string)
+
+	// invalidate all offers associated with the slave identified by slaveId.
+	InvalidateForSlave(slaveId string)
+}
+
+// callback that is invoked during a walk through a series of live offers,
+// returning with stop=true (or err != nil) if the walk should stop prematurely.
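+//
+// a purely illustrative Walker (a sketch, not part of this patch) that claims
+// the first unacquired offer and halts the walk:
+//
+//	func(offer Perishable) (stop bool, err error) {
+//		if offer.Acquire() {
+//			return true, nil // claimed; the caller now owns the offer
+//		}
+//		return false, nil // keep walking
+//	}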
+type Walker func(offer Perishable) (stop bool, err error)
+
+type RegistryConfig struct {
+	DeclineOffer  func(offerId string) <-chan error // tell Mesos that we're declining the offer
+	Compat        func(*mesos.Offer) bool           // returns true if offer is compatible; incompatible offers are declined
+	TTL           time.Duration                     // determines a perishable offer's expiration deadline: now+ttl
+	LingerTTL     time.Duration                     // if zero, offers will not linger in the FIFO past their expiration deadline
+	ListenerDelay time.Duration                     // specifies the sleep time between offer listener notifications
+}
+
+type offerStorage struct {
+	RegistryConfig
+	offers    *cache.FIFO       // collection of Perishable, both live and expired
+	listeners *queue.DelayFIFO  // collection of *offerListener
+	delayed   *queue.DelayQueue // deadline-oriented offer-event queue
+	slaves    *slaveStorage     // slave to offer mappings
+}
+
+type liveOffer struct {
+	*mesos.Offer
+	expiration time.Time
+	acquired   int32 // 1 = acquired, 0 = free
+}
+
+type expiredOffer struct {
+	offerSpec
+	deadline time.Time
+}
+
+// subset of mesos.Offer details useful for recordkeeping
+type offerSpec struct {
+	id       string
+	hostname string
+}
+
+// offers that may perish (all of them?) implement this interface.
+// callers may expect to access these funcs concurrently so implementations
+// must provide their own form of synchronization around mutable state.
+type Perishable interface {
+	// returns true if this offer has expired
+	HasExpired() bool
+	// if not yet expired, return mesos offer details; otherwise nil
+	Details() *mesos.Offer
+	// mark this offer as acquired, returning true if it was previously unacquired. thread-safe.
+	Acquire() bool
+	// mark this offer as un-acquired. thread-safe.
+	Release()
+	// expire or delete this offer from storage
+	age(s *offerStorage)
+	// return a unique identifier for this offer
+	Id() string
+	// return the slave host for this offer
+	Host() string
+	addTo(*queue.DelayQueue)
+}
+
+func (e *expiredOffer) addTo(q *queue.DelayQueue) {
+	q.Add(e)
+}
+
+func (e *expiredOffer) Id() string {
+	return e.id
+}
+
+func (e *expiredOffer) Host() string {
+	return e.hostname
+}
+
+func (e *expiredOffer) HasExpired() bool {
+	return true
+}
+
+func (e *expiredOffer) Details() *mesos.Offer {
+	return nil
+}
+
+func (e *expiredOffer) Acquire() bool {
+	return false
+}
+
+func (e *expiredOffer) Release() {}
+
+func (e *expiredOffer) age(s *offerStorage) {
+	log.V(3).Infof("Delete lingering offer: %v", e.id)
+	s.offers.Delete(e)
+	s.slaves.deleteOffer(e.id)
+}
+
+// return the time left to linger
+func (e *expiredOffer) GetDelay() time.Duration {
+	return e.deadline.Sub(time.Now())
+}
+
+func (to *liveOffer) HasExpired() bool {
+	return time.Now().After(to.expiration)
+}
+
+func (to *liveOffer) Details() *mesos.Offer {
+	return to.Offer
+}
+
+func (to *liveOffer) Acquire() (acquired bool) {
+	if acquired = atomic.CompareAndSwapInt32(&to.acquired, 0, 1); acquired {
+		metrics.OffersAcquired.WithLabelValues(to.Host()).Inc()
+	}
+	return
+}
+
+func (to *liveOffer) Release() {
+	if released := atomic.CompareAndSwapInt32(&to.acquired, 1, 0); released {
+		metrics.OffersReleased.WithLabelValues(to.Host()).Inc()
+	}
+}
+
+func (to *liveOffer) age(s *offerStorage) {
+	s.Delete(to.Id(), metrics.OfferExpired)
+}
+
+func (to *liveOffer) Id() string {
+	return to.Offer.Id.GetValue()
+}
+
+func (to *liveOffer) Host() string {
+	return to.Offer.GetHostname()
+}
+
+func (to *liveOffer) addTo(q *queue.DelayQueue) {
+	q.Add(to)
+}
+
+// return the time remaining before the offer expires
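+// (this delay feeds the queue.DelayQueue used by ageOffers: an entry is
+// popped for aging only after its delay has elapsed.)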
+func (to *liveOffer) GetDelay() time.Duration {
+	return to.expiration.Sub(time.Now())
+}
+
+func CreateRegistry(c RegistryConfig) Registry {
+	metrics.Register()
+	return &offerStorage{
+		RegistryConfig: c,
+		offers: cache.NewFIFO(cache.KeyFunc(func(v interface{}) (string, error) {
+			if perishable, ok := v.(Perishable); !ok {
+				return "", fmt.Errorf("expected perishable offer, not '%+v'", v)
+			} else {
+				return perishable.Id(), nil
+			}
+		})),
+		listeners: queue.NewDelayFIFO(),
+		delayed:   queue.NewDelayQueue(),
+		slaves:    newSlaveStorage(),
+	}
+}
+
+func (s *offerStorage) declineOffer(offerId, hostname string, reason metrics.OfferDeclinedReason) {
+	//TODO(jdef) might be nice to spec an abort chan here
+	runtime.Signal(proc.OnError(s.DeclineOffer(offerId), func(err error) {
+		log.Warningf("decline failed for offer id %v: %v", offerId, err)
+	}, nil)).Then(func() {
+		metrics.OffersDeclined.WithLabelValues(hostname, string(reason)).Inc()
+	})
+}
+
+func (s *offerStorage) Add(offers []*mesos.Offer) {
+	now := time.Now()
+	for _, offer := range offers {
+		if !s.Compat(offer) {
+			//TODO(jdef) would be nice to batch these up
+			offerId := offer.Id.GetValue()
+			log.V(3).Infof("Declining incompatible offer %v", offerId)
+			s.declineOffer(offerId, offer.GetHostname(), metrics.OfferCompat)
+			continue // process the remaining offers in the batch
+		}
+		timed := &liveOffer{
+			Offer:      offer,
+			expiration: now.Add(s.TTL),
+			acquired:   0,
+		}
+		log.V(3).Infof("Receiving offer %v", timed.Id())
+		s.offers.Add(timed)
+		s.delayed.Add(timed)
+		s.slaves.add(offer.SlaveId.GetValue(), timed.Id())
+		metrics.OffersReceived.WithLabelValues(timed.Host()).Inc()
+	}
+}
+
+// delete an offer from storage, implicitly expires the offer
+func (s *offerStorage) Delete(offerId string, reason metrics.OfferDeclinedReason) {
+	if offer, ok := s.Get(offerId); ok {
+		log.V(3).Infof("Deleting offer %v", offerId)
+		// attempt to block others from consuming the offer. if it's already been
+		// claimed and is not yet lingering then don't decline it - just mark it as
+		// expired in the history: allow a prior claimant to attempt to launch with it
+		notYetClaimed := offer.Acquire()
+		if offer.Details() != nil {
+			if notYetClaimed {
+				log.V(3).Infof("Declining offer %v", offerId)
+				s.declineOffer(offerId, offer.Host(), reason)
+			} else {
+				// some pod has acquired this and may attempt to launch a task with it
+				// failed schedule/launch attempts are required to Release() any claims on the offer
+
+				// TODO(jdef): not sure what a good value is here. the goal is to provide a
+				// launchTasks (driver) operation enough time to complete so that we don't end
+				// up declining an offer that we're actually attempting to use.
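+				// for example: with TTL=5s and deferredDeclineTtlFactor=2, the
+				// decline below fires 10s after the claimed offer was deleted
+				// (illustrative numbers, not values mandated by this patch).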
+				time.AfterFunc(deferredDeclineTtlFactor*s.TTL, func() {
+					// at this point the offer is in one of five states:
+					// a) permanently deleted: expired due to timeout
+					// b) permanently deleted: expired due to having been rescinded
+					// c) lingering: expired due to timeout
+					// d) lingering: expired due to having been rescinded
+					// e) claimed: task launched and it's using resources from this offer
+					// we want to **avoid** declining an offer that's claimed: attempt to acquire
+					if offer.Acquire() {
+						// previously claimed offer was released, perhaps due to a launch
+						// failure, so we should attempt to decline
+						log.V(3).Infof("attempting to decline (previously claimed) offer %v", offerId)
+						s.declineOffer(offerId, offer.Host(), reason)
+					}
+				})
+			}
+		}
+		s.expireOffer(offer)
+	} // else, ignore offers not in the history
+}
+
+func (s *offerStorage) InvalidateForSlave(slaveId string) {
+	offerIds := s.slaves.deleteSlave(slaveId)
+	for oid := range offerIds {
+		s.invalidateOne(oid)
+	}
+}
+
+// if offerId == "" then expire all known, live offers, otherwise only the offer indicated
+func (s *offerStorage) Invalidate(offerId string) {
+	if offerId != "" {
+		s.invalidateOne(offerId)
+		return
+	}
+	obj := s.offers.List()
+	for _, o := range obj {
+		offer, ok := o.(Perishable)
+		if !ok {
+			log.Errorf("Expected perishable offer, not %v", o)
+			continue
+		}
+		offer.Acquire() // attempt to block others from using it
+		s.expireOffer(offer)
+		// don't decline, we already know that it's an invalid offer
+	}
+}
+
+func (s *offerStorage) invalidateOne(offerId string) {
+	if offer, ok := s.Get(offerId); ok {
+		offer.Acquire() // attempt to block others from using it
+		s.expireOffer(offer)
+		// don't decline, we already know that it's an invalid offer
+	}
+}
+
+// Walk the collection of offers. The walk stops either as indicated by the
+// Walker or when the end of the offer list is reached. Expired offers are
+// never passed to a Walker.
+func (s *offerStorage) Walk(w Walker) error {
+	for _, v := range s.offers.List() {
+		offer, ok := v.(Perishable)
+		if !ok {
+			// offer disappeared...
+			continue
+		}
+		if offer.HasExpired() {
+			// never pass expired offers to walkers
+			continue
+		}
+		if stop, err := w(offer); err != nil {
+			return err
+		} else if stop {
+			return nil
+		}
+	}
+	return nil
+}
+
+func Expired(offerId, hostname string, ttl time.Duration) *expiredOffer {
+	return &expiredOffer{offerSpec{id: offerId, hostname: hostname}, time.Now().Add(ttl)}
+}
+
+func (s *offerStorage) expireOffer(offer Perishable) {
+	// the offer may or may not be expired due to TTL so check for details
+	// since that's a more reliable determinant of lingering status
+	if details := offer.Details(); details != nil {
+		// recently expired, should linger
+		offerId := details.Id.GetValue()
+		log.V(3).Infof("Expiring offer %v", offerId)
+		if s.LingerTTL > 0 {
+			log.V(3).Infof("offer will linger: %v", offerId)
+			expired := Expired(offerId, offer.Host(), s.LingerTTL)
+			s.offers.Update(expired)
+			s.delayed.Add(expired)
+		} else {
+			log.V(3).Infof("Permanently deleting offer %v", offerId)
+			s.offers.Delete(offerId)
+			s.slaves.deleteOffer(offerId)
+		}
+	} // else, it's still lingering...
+} + +func (s *offerStorage) Get(id string) (Perishable, bool) { + if obj, ok, _ := s.offers.GetByKey(id); !ok { + return nil, false + } else { + to, ok := obj.(Perishable) + if !ok { + log.Errorf("invalid offer object in fifo '%v'", obj) + } + return to, ok + } +} + +type offerListener struct { + id string + accepts Filter + notify chan<- struct{} + age int + deadline time.Time + sawVersion uint64 +} + +func (l *offerListener) GetUID() string { + return l.id +} + +func (l *offerListener) Deadline() (time.Time, bool) { + return l.deadline, true +} + +// register a listener for new offers, whom we'll notify upon receiving such. +// notification is delivered in the form of closing the channel, nothing is ever sent. +func (s *offerStorage) Listen(id string, f Filter) <-chan struct{} { + if f == nil { + return nil + } + ch := make(chan struct{}) + listen := &offerListener{ + id: id, + accepts: f, + notify: ch, + deadline: time.Now().Add(s.ListenerDelay), + } + log.V(3).Infof("Registering offer listener %s", listen.id) + s.listeners.Offer(listen, queue.ReplaceExisting) + return ch +} + +func (s *offerStorage) ageOffers() { + offer, ok := s.delayed.Pop().(Perishable) + if !ok { + log.Errorf("Expected Perishable, not %v", offer) + return + } + if details := offer.Details(); details != nil && !offer.HasExpired() { + // live offer has not expired yet: timed out early + // FWIW: early timeouts are more frequent when GOMAXPROCS is > 1 + offer.addTo(s.delayed) + } else { + offer.age(s) + } +} + +func (s *offerStorage) nextListener() *offerListener { + obj := s.listeners.Pop() + if listen, ok := obj.(*offerListener); !ok { + //programming error + panic(fmt.Sprintf("unexpected listener object %v", obj)) + } else { + return listen + } +} + +// notify listeners if we find an acceptable offer for them. listeners +// are garbage collected after a certain age (see offerListenerMaxAge). +// ids lists offer IDs that are retrievable from offer storage. 
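+// the uint64 version returned by ids tracks changes to that ID set: a listener
+// that already saw the current version is re-queued with a fresh deadline but
+// does not age, so listeners age only against offer sets that actually changed.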
+func (s *offerStorage) notifyListeners(ids func() (util.StringSet, uint64)) { + listener := s.nextListener() // blocking + + offerIds, version := ids() + if listener.sawVersion == version { + // no changes to offer list, avoid growing older - just wait for new offers to arrive + listener.deadline = time.Now().Add(s.ListenerDelay) + s.listeners.Offer(listener, queue.KeepExisting) + return + } + listener.sawVersion = version + + // notify if we find an acceptable offer + for id := range offerIds { + if offer, ok := s.Get(id); !ok || offer.HasExpired() { + continue + } else if listener.accepts(offer.Details()) { + log.V(3).Infof("Notifying offer listener %s", listener.id) + close(listener.notify) + return + } + } + + // no interesting offers found, re-queue the listener + listener.age++ + if listener.age < offerListenerMaxAge { + listener.deadline = time.Now().Add(s.ListenerDelay) + s.listeners.Offer(listener, queue.KeepExisting) + } else { + // garbage collection is as simple as not re-adding the listener to the queue + log.V(3).Infof("garbage collecting offer listener %s", listener.id) + } +} + +func (s *offerStorage) Init(done <-chan struct{}) { + // zero delay, reap offers as soon as they expire + go runtime.Until(s.ageOffers, 0, done) + + // cached offer ids for the purposes of listener notification + idCache := &stringsCache{ + refill: func() util.StringSet { + result := util.NewStringSet() + for _, v := range s.offers.List() { + if offer, ok := v.(Perishable); ok { + result.Insert(offer.Id()) + } + } + return result + }, + ttl: offerIdCacheTTL, + } + + go runtime.Until(func() { s.notifyListeners(idCache.Strings) }, notifyListenersDelay, done) +} + +type stringsCache struct { + expiresAt time.Time + cached util.StringSet + ttl time.Duration + refill func() util.StringSet + version uint64 +} + +// not thread-safe +func (c *stringsCache) Strings() (util.StringSet, uint64) { + now := time.Now() + if c.expiresAt.Before(now) { + old := c.cached + c.cached = c.refill() + c.expiresAt = now.Add(c.ttl) + if !reflect.DeepEqual(old, c.cached) { + c.version++ + } + } + return c.cached, c.version +} + +type slaveStorage struct { + sync.Mutex + index map[string]string // map offerId to slaveId +} + +func newSlaveStorage() *slaveStorage { + return &slaveStorage{ + index: make(map[string]string), + } +} + +// create a mapping between a slave and an offer +func (self *slaveStorage) add(slaveId, offerId string) { + self.Lock() + defer self.Unlock() + self.index[offerId] = slaveId +} + +// delete the slave-offer mappings for slaveId, returns the IDs of the offers that were unmapped +func (self *slaveStorage) deleteSlave(slaveId string) util.StringSet { + offerIds := util.NewStringSet() + self.Lock() + defer self.Unlock() + for oid, sid := range self.index { + if sid == slaveId { + offerIds.Insert(oid) + delete(self.index, oid) + } + } + return offerIds +} + +// delete the slave-offer mappings for offerId +func (self *slaveStorage) deleteOffer(offerId string) { + self.Lock() + defer self.Unlock() + delete(self.index, offerId) +} diff --git a/contrib/mesos/pkg/offers/offers_test.go b/contrib/mesos/pkg/offers/offers_test.go new file mode 100644 index 00000000000..5b44eee5afc --- /dev/null +++ b/contrib/mesos/pkg/offers/offers_test.go @@ -0,0 +1,391 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package offers + +import ( + "errors" + "sync/atomic" + "testing" + "time" + + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc" + mesos "github.com/mesos/mesos-go/mesosproto" + util "github.com/mesos/mesos-go/mesosutil" +) + +func TestExpiredOffer(t *testing.T) { + t.Parallel() + + ttl := 2 * time.Second + o := Expired("test", "testhost", ttl) + + if o.Id() != "test" { + t.Error("expiredOffer does not return its Id") + } + if o.Host() != "testhost" { + t.Error("expiredOffer does not return its hostname") + } + if o.HasExpired() != true { + t.Error("expiredOffer is not expired") + } + if o.Details() != nil { + t.Error("expiredOffer does not return nil Details") + } + if o.Acquire() != false { + t.Error("expiredOffer must not be able to be acquired") + } + if delay := o.GetDelay(); !(0 < delay && delay <= ttl) { + t.Error("expiredOffer does not return a valid deadline") + } +} // TestExpiredOffer + +func TestTimedOffer(t *testing.T) { + t.Parallel() + + ttl := 2 * time.Second + now := time.Now() + o := &liveOffer{nil, now.Add(ttl), 0} + + if o.HasExpired() { + t.Errorf("offer ttl was %v and should not have expired yet", ttl) + } + if !o.Acquire() { + t.Fatal("1st acquisition of offer failed") + } + o.Release() + if !o.Acquire() { + t.Fatal("2nd acquisition of offer failed") + } + if o.Acquire() { + t.Fatal("3rd acquisition of offer passed but prior claim was not released") + } + o.Release() + if !o.Acquire() { + t.Fatal("4th acquisition of offer failed") + } + o.Release() + time.Sleep(ttl) + if !o.HasExpired() { + t.Fatal("offer not expired after ttl passed") + } + if !o.Acquire() { + t.Fatal("5th acquisition of offer failed; should not be tied to expiration") + } + if o.Acquire() { + t.Fatal("6th acquisition of offer succeeded; should already be acquired") + } +} // TestTimedOffer + +func TestOfferStorage(t *testing.T) { + ttl := time.Second / 4 + var declinedNum int32 + getDeclinedNum := func() int32 { return atomic.LoadInt32(&declinedNum) } + config := RegistryConfig{ + DeclineOffer: func(offerId string) <-chan error { + atomic.AddInt32(&declinedNum, 1) + return proc.ErrorChan(nil) + }, + Compat: func(o *mesos.Offer) bool { + return o.Hostname == nil || *o.Hostname != "incompatiblehost" + }, + TTL: ttl, + LingerTTL: 2 * ttl, + } + storage := CreateRegistry(config) + + done := make(chan struct{}) + storage.Init(done) + + // Add offer + id := util.NewOfferID("foo") + o := &mesos.Offer{Id: id} + storage.Add([]*mesos.Offer{o}) + + // Added offer should be in the storage + if obj, ok := storage.Get(id.GetValue()); obj == nil || !ok { + t.Error("offer not added") + } + if obj, _ := storage.Get(id.GetValue()); obj.Details() != o { + t.Error("added offer differs from returned offer") + } + + // Not-added offer is not in storage + if obj, ok := storage.Get("bar"); obj != nil || ok { + t.Error("offer bar should not exist in storage") + } + + // Deleted offer lingers in storage, is acquired and declined + offer, _ := storage.Get(id.GetValue()) + declinedNumBefore := getDeclinedNum() + storage.Delete(id.GetValue(), "deleted for test") + if obj, _ := storage.Get(id.GetValue()); obj 
== nil {
+		t.Error("deleted offer is not lingering")
+	}
+	if obj, _ := storage.Get(id.GetValue()); !obj.HasExpired() {
+		t.Error("deleted offer is not expired")
+	}
+	if ok := offer.Acquire(); ok {
+		t.Error("deleted offer can be acquired")
+	}
+	if getDeclinedNum() <= declinedNumBefore {
+		t.Error("deleted offer was not declined")
+	}
+
+	// Acquired offer is only declined after 2*ttl
+	id = util.NewOfferID("foo2")
+	o = &mesos.Offer{Id: id}
+	storage.Add([]*mesos.Offer{o})
+	offer, _ = storage.Get(id.GetValue())
+	declinedNumBefore = getDeclinedNum()
+	offer.Acquire()
+	storage.Delete(id.GetValue(), "deleted for test")
+	if getDeclinedNum() > declinedNumBefore {
+		t.Error("acquired offer is declined")
+	}
+
+	offer.Release()
+	time.Sleep(3 * ttl)
+	if getDeclinedNum() <= declinedNumBefore {
+		t.Error("released offer is not declined after 2*ttl")
+	}
+
+	// Added offer should be expired after ttl, but lingering
+	id = util.NewOfferID("foo3")
+	o = &mesos.Offer{Id: id}
+	storage.Add([]*mesos.Offer{o})
+
+	time.Sleep(2 * ttl)
+	obj, ok := storage.Get(id.GetValue())
+	if obj == nil || !ok {
+		t.Error("offer not lingering after ttl")
+	}
+	if !obj.HasExpired() {
+		t.Error("offer is not expired after ttl")
+	}
+
+	// Should be deleted when waiting longer than LingerTTL
+	time.Sleep(2 * ttl)
+	if obj, ok := storage.Get(id.GetValue()); obj != nil || ok {
+		t.Error("offer not deleted after LingerTTL")
+	}
+
+	// Incompatible offer is declined
+	id = util.NewOfferID("foo4")
+	incompatibleHostname := "incompatiblehost"
+	o = &mesos.Offer{Id: id, Hostname: &incompatibleHostname}
+	declinedNumBefore = getDeclinedNum()
+	storage.Add([]*mesos.Offer{o})
+	if obj, ok := storage.Get(id.GetValue()); obj != nil || ok {
+		t.Error("incompatible offer not rejected")
+	}
+	if getDeclinedNum() <= declinedNumBefore {
+		t.Error("incompatible offer is not declined")
+	}
+
+	// Invalidated offers are not declined, but expired
+	id = util.NewOfferID("foo5")
+	o = &mesos.Offer{Id: id}
+	storage.Add([]*mesos.Offer{o})
+	offer, _ = storage.Get(id.GetValue())
+	declinedNumBefore = getDeclinedNum()
+	storage.Invalidate(id.GetValue())
+	if obj, _ := storage.Get(id.GetValue()); !obj.HasExpired() {
+		t.Error("invalidated offer is not expired")
+	}
+	if getDeclinedNum() > declinedNumBefore {
+		t.Error("invalidated offer is declined")
+	}
+	if ok := offer.Acquire(); ok {
+		t.Error("invalidated offer can be acquired")
+	}
+
+	// Invalidate "" will invalidate all offers
+	id = util.NewOfferID("foo6")
+	o = &mesos.Offer{Id: id}
+	storage.Add([]*mesos.Offer{o})
+	id2 := util.NewOfferID("foo7")
+	o2 := &mesos.Offer{Id: id2}
+	storage.Add([]*mesos.Offer{o2})
+	storage.Invalidate("")
+	if obj, _ := storage.Get(id.GetValue()); !obj.HasExpired() {
+		t.Error("invalidated offer is not expired")
+	}
+	if obj2, _ := storage.Get(id2.GetValue()); !obj2.HasExpired() {
+		t.Error("invalidated offer is not expired")
+	}
+
+	// InvalidateForSlave invalidates all offers for that slave, and only those
+	id = util.NewOfferID("foo8")
+	slaveId := util.NewSlaveID("test-slave")
+	o = &mesos.Offer{Id: id, SlaveId: slaveId}
+	storage.Add([]*mesos.Offer{o})
+	id2 = util.NewOfferID("foo9")
+	o2 = &mesos.Offer{Id: id2}
+	storage.Add([]*mesos.Offer{o2})
+	storage.InvalidateForSlave(slaveId.GetValue())
+	if obj, _ := storage.Get(id.GetValue()); !obj.HasExpired() {
+		t.Error("invalidated offer for test-slave is not expired")
+	}
+	if obj2, _ := storage.Get(id2.GetValue()); obj2.HasExpired() {
+		t.Error("invalidated offer for another slave is expired")
+	}
+
close(done) +} // TestOfferStorage + +func TestListen(t *testing.T) { + ttl := time.Second / 4 + config := RegistryConfig{ + DeclineOffer: func(offerId string) <-chan error { + return proc.ErrorChan(nil) + }, + Compat: func(o *mesos.Offer) bool { + return true + }, + TTL: ttl, + ListenerDelay: ttl / 2, + } + storage := CreateRegistry(config) + + done := make(chan struct{}) + storage.Init(done) + + // Create two listeners with a hostname filter + hostname1 := "hostname1" + hostname2 := "hostname2" + listener1 := storage.Listen("listener1", func(offer *mesos.Offer) bool { + return offer.GetHostname() == hostname1 + }) + listener2 := storage.Listen("listener2", func(offer *mesos.Offer) bool { + return offer.GetHostname() == hostname2 + }) + + // Add hostname1 offer + id := util.NewOfferID("foo") + o := &mesos.Offer{Id: id, Hostname: &hostname1} + storage.Add([]*mesos.Offer{o}) + + // listener1 is notified by closing channel + select { + case _, more := <-listener1: + if more { + t.Error("listener1 is not closed") + } + } + + // listener2 is not notified within ttl + select { + case <-listener2: + t.Error("listener2 is notified") + case <-time.After(ttl): + } + + close(done) +} // TestListen + +func TestWalk(t *testing.T) { + t.Parallel() + config := RegistryConfig{ + DeclineOffer: func(offerId string) <-chan error { + return proc.ErrorChan(nil) + }, + TTL: 0 * time.Second, + LingerTTL: 0 * time.Second, + ListenerDelay: 0 * time.Second, + } + storage := CreateRegistry(config) + acceptedOfferId := "" + walked := 0 + walker1 := func(p Perishable) (bool, error) { + walked++ + if p.Acquire() { + acceptedOfferId = p.Details().Id.GetValue() + return true, nil + } + return false, nil + } + // sanity check + err := storage.Walk(walker1) + if err != nil { + t.Fatalf("received impossible error %v", err) + } + if walked != 0 { + t.Fatal("walked empty storage") + } + if acceptedOfferId != "" { + t.Fatal("somehow found an offer when registry was empty") + } + impl, ok := storage.(*offerStorage) + if !ok { + t.Fatal("unexpected offer storage impl") + } + // single offer + ttl := 2 * time.Second + now := time.Now() + o := &liveOffer{&mesos.Offer{Id: util.NewOfferID("foo")}, now.Add(ttl), 0} + + impl.offers.Add(o) + err = storage.Walk(walker1) + if err != nil { + t.Fatalf("received impossible error %v", err) + } + if walked != 1 { + t.Fatalf("walk count %d", walked) + } + if acceptedOfferId != "foo" { + t.Fatalf("found offer %v", acceptedOfferId) + } + + acceptedOfferId = "" + err = storage.Walk(walker1) + if err != nil { + t.Fatalf("received impossible error %v", err) + } + if walked != 2 { + t.Fatalf("walk count %d", walked) + } + if acceptedOfferId != "" { + t.Fatalf("found offer %v", acceptedOfferId) + } + + walker2 := func(p Perishable) (bool, error) { + walked++ + return true, nil + } + err = storage.Walk(walker2) + if err != nil { + t.Fatalf("received impossible error %v", err) + } + if walked != 3 { + t.Fatalf("walk count %d", walked) + } + if acceptedOfferId != "" { + t.Fatalf("found offer %v", acceptedOfferId) + } + + walker3 := func(p Perishable) (bool, error) { + walked++ + return true, errors.New("baz") + } + err = storage.Walk(walker3) + if err == nil { + t.Fatal("expected error") + } + if walked != 4 { + t.Fatalf("walk count %d", walked) + } +} diff --git a/contrib/mesos/pkg/proc/doc.go b/contrib/mesos/pkg/proc/doc.go new file mode 100644 index 00000000000..ec3b4e0f80f --- /dev/null +++ b/contrib/mesos/pkg/proc/doc.go @@ -0,0 +1,19 @@ +/* +Copyright 2015 The Kubernetes Authors All rights 
reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package proc provides opinionated utilities for processing background +// operations and future errors, somewhat inspired by libprocess. +package proc diff --git a/contrib/mesos/pkg/proc/errors.go b/contrib/mesos/pkg/proc/errors.go new file mode 100644 index 00000000000..c7fe0f442e6 --- /dev/null +++ b/contrib/mesos/pkg/proc/errors.go @@ -0,0 +1,34 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package proc + +import ( + "errors" +) + +var ( + errProcessTerminated = errors.New("cannot execute action because process has terminated") + errIllegalState = errors.New("illegal state, cannot execute action") +) + +func IsProcessTerminated(err error) bool { + return err == errProcessTerminated +} + +func IsIllegalState(err error) bool { + return err == errIllegalState +} diff --git a/contrib/mesos/pkg/proc/proc.go b/contrib/mesos/pkg/proc/proc.go new file mode 100644 index 00000000000..159e523961f --- /dev/null +++ b/contrib/mesos/pkg/proc/proc.go @@ -0,0 +1,377 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package proc + +import ( + "fmt" + "sync" + "sync/atomic" + "time" + + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime" + log "github.com/golang/glog" +) + +const ( + // if the action processor crashes (if some Action panics) then we + // wait this long before spinning up the action processor again. 
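+	// (the restart loop itself is provided by runtime.Until in begin() below.)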
+	defaultActionHandlerCrashDelay = 100 * time.Millisecond
+
+	// how many actions we can store in the backlog
+	defaultActionQueueDepth = 1024
+)
+
+type procImpl struct {
+	Config
+	backlog   chan Action    // action queue
+	terminate chan struct{}  // signaled via close()
+	wg        sync.WaitGroup // End() terminates when the wait is over
+	done      runtime.Signal
+	state     *stateType
+	pid       uint32
+	writeLock sync.Mutex    // avoid data race between write and close of backlog
+	changed   *sync.Cond    // wait/signal for backlog changes
+	engine    DoerFunc      // isolated this for easier unit testing later on
+	running   chan struct{} // closes once event loop processing starts
+	dead      chan struct{} // closes upon completion of process termination
+}
+
+type Config struct {
+	// cooldown period in between deferred action crashes
+	actionHandlerCrashDelay time.Duration
+
+	// determines the size of the deferred action backlog
+	actionQueueDepth uint32
+}
+
+var (
+	defaultConfig = Config{
+		actionHandlerCrashDelay: defaultActionHandlerCrashDelay,
+		actionQueueDepth:        defaultActionQueueDepth,
+	}
+	pid           uint32
+	closedErrChan <-chan error
+)
+
+func init() {
+	ch := make(chan error)
+	close(ch)
+	closedErrChan = ch
+}
+
+func New() Process {
+	return newConfigured(defaultConfig)
+}
+
+func newConfigured(config Config) Process {
+	state := stateNew
+	pi := &procImpl{
+		Config:    config,
+		backlog:   make(chan Action, config.actionQueueDepth),
+		terminate: make(chan struct{}),
+		state:     &state,
+		pid:       atomic.AddUint32(&pid, 1),
+		running:   make(chan struct{}),
+		dead:      make(chan struct{}),
+	}
+	pi.engine = DoerFunc(pi.doLater)
+	pi.changed = sync.NewCond(&pi.writeLock)
+	pi.wg.Add(1) // symmetrical to wg.Done() in End()
+	pi.done = pi.begin()
+	return pi
+}
+
+// returns a chan that closes upon termination of the action processing loop
+func (self *procImpl) Done() <-chan struct{} {
+	return self.done
+}
+
+func (self *procImpl) Running() <-chan struct{} {
+	return self.running
+}
+
+func (self *procImpl) begin() runtime.Signal {
+	if !self.state.transition(stateNew, stateRunning) {
+		panic(fmt.Errorf("failed to transition from New to Running state"))
+	}
+	defer log.V(2).Infof("started process %d", self.pid)
+	var entered runtime.Latch
+	// execute actions on the backlog chan
+	return runtime.After(func() {
+		runtime.Until(func() {
+			if entered.Acquire() {
+				close(self.running)
+				self.wg.Add(1)
+			}
+			for action := range self.backlog {
+				select {
+				case <-self.terminate:
+					return
+				default:
+					// signal to indicate there's room in the backlog now
+					self.changed.Broadcast()
+					// rely on Until to handle action panics
+					action()
+				}
+			}
+		}, self.actionHandlerCrashDelay, self.terminate)
+	}).Then(func() {
+		log.V(2).Infof("finished processing action backlog for process %d", self.pid)
+		if !entered.Acquire() {
+			self.wg.Done()
+		}
+	})
+}
+
+// execute some action in the context of the current process. Actions
+// executed via this func are to be executed in a concurrency-safe manner:
+// no two actions should execute at the same time. invocations of this func
+// should not block for very long, unless the action backlog is full or the
+// process is terminating.
+// returns errProcessTerminated if the process already ended.
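+//
+// illustrative call pattern (a sketch, not part of this patch):
+//
+//	p := New()
+//	errCh := p.Do(func() { /* runs serially on the process goroutine */ })
+//	// errCh is nil when the action was successfully scheduled; OnError
+//	// tolerates a nil chan and simply closes its signal immediately.
+//	<-OnError(errCh, func(err error) { log.Errorf("action failed: %v", err) }, p.Done())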
+func (self *procImpl) doLater(deferredAction Action) (err <-chan error) { + a := Action(func() { + self.wg.Add(1) + defer self.wg.Done() + deferredAction() + }) + + scheduled := false + self.writeLock.Lock() + defer self.writeLock.Unlock() + + for err == nil && !scheduled { + switch s := self.state.get(); s { + case stateRunning: + select { + case self.backlog <- a: + scheduled = true + default: + self.changed.Wait() + } + case stateTerminal: + err = ErrorChan(errProcessTerminated) + default: + err = ErrorChan(errIllegalState) + } + } + return +} + +// implementation of Doer interface, schedules some action to be executed via +// the current execution engine +func (self *procImpl) Do(a Action) <-chan error { + return self.engine(a) +} + +// spawn a goroutine that waits for an error. if a non-nil error is read from the +// channel then the handler func is invoked, otherwise (nil error or closed chan) +// the handler is skipped. if a nil handler is specified then it's not invoked. +// the signal chan that's returned closes once the error process logic (and handler, +// if any) has completed. +func OnError(ch <-chan error, f func(error), abort <-chan struct{}) <-chan struct{} { + return runtime.After(func() { + if ch == nil { + return + } + select { + case err, ok := <-ch: + if ok && err != nil && f != nil { + f(err) + } + case <-abort: + if f != nil { + f(errProcessTerminated) + } + } + }) +} + +func (self *procImpl) OnError(ch <-chan error, f func(error)) <-chan struct{} { + return OnError(ch, f, self.Done()) +} + +func (self *procImpl) flush() { + log.V(2).Infof("flushing action backlog for process %d", self.pid) + i := 0 + //TODO: replace with `for range self.backlog` once Go 1.3 support is dropped + for { + _, open := <-self.backlog + if !open { + break + } + i++ + } + log.V(2).Infof("flushed %d backlog actions for process %d", i, self.pid) +} + +func (self *procImpl) End() <-chan struct{} { + if self.state.transitionTo(stateTerminal, stateTerminal) { + go func() { + defer close(self.dead) + self.writeLock.Lock() + defer self.writeLock.Unlock() + + log.V(2).Infof("terminating process %d", self.pid) + + close(self.backlog) + close(self.terminate) + self.wg.Done() + self.changed.Broadcast() + + log.V(2).Infof("waiting for deferred actions to complete") + + // wait for all pending actions to complete, then flush the backlog + self.wg.Wait() + self.flush() + }() + } + return self.dead +} + +type errorOnce struct { + once sync.Once + err chan error + abort <-chan struct{} +} + +func NewErrorOnce(abort <-chan struct{}) ErrorOnce { + return &errorOnce{ + err: make(chan error, 1), + abort: abort, + } +} + +func (b *errorOnce) Err() <-chan error { + return b.err +} + +func (b *errorOnce) Reportf(msg string, args ...interface{}) { + b.Report(fmt.Errorf(msg, args...)) +} + +func (b *errorOnce) Report(err error) { + b.once.Do(func() { + select { + case b.err <- err: + default: + } + }) +} + +func (b *errorOnce) Send(errIn <-chan error) ErrorOnce { + go b.forward(errIn) + return b +} + +func (b *errorOnce) forward(errIn <-chan error) { + if errIn == nil { + b.Report(nil) + return + } + select { + case err, _ := <-errIn: + b.Report(err) + case <-b.abort: + b.Report(errProcessTerminated) + } +} + +type processAdapter struct { + parent Process + delegate Doer +} + +func (p *processAdapter) Do(a Action) <-chan error { + if p == nil || p.parent == nil || p.delegate == nil { + return ErrorChan(errIllegalState) + } + errCh := NewErrorOnce(p.Done()) + go func() { + errOuter := p.parent.Do(func() { + 
errInner := p.delegate.Do(a)
+			errCh.forward(errInner)
+		})
+		// if the outer err is !nil then either the parent failed to schedule
+		// the action, or else it backgrounded the scheduling task.
+		if errOuter != nil {
+			errCh.forward(errOuter)
+		}
+	}()
+	return errCh.Err()
+}
+
+func (p *processAdapter) End() <-chan struct{} {
+	if p != nil && p.parent != nil {
+		return p.parent.End()
+	}
+	return nil
+}
+
+func (p *processAdapter) Done() <-chan struct{} {
+	if p != nil && p.parent != nil {
+		return p.parent.Done()
+	}
+	return nil
+}
+
+func (p *processAdapter) Running() <-chan struct{} {
+	if p != nil && p.parent != nil {
+		return p.parent.Running()
+	}
+	return nil
+}
+
+func (p *processAdapter) OnError(ch <-chan error, f func(error)) <-chan struct{} {
+	if p != nil && p.parent != nil {
+		return p.parent.OnError(ch, f)
+	}
+	return nil
+}
+
+// returns a process that, within its execution context, delegates to the specified Doer.
+// if the given Doer instance is nil, a valid Process is still returned though calls to its
+// Do() implementation will always return errIllegalState.
+// if the given Process instance is nil then in addition to the behavior in the prior sentence,
+// calls to End() and Done() are effectively noops.
+func DoWith(other Process, d Doer) Process {
+	return &processAdapter{
+		parent:   other,
+		delegate: d,
+	}
+}
+
+func ErrorChanf(msg string, args ...interface{}) <-chan error {
+	return ErrorChan(fmt.Errorf(msg, args...))
+}
+
+func ErrorChan(err error) <-chan error {
+	if err == nil {
+		return closedErrChan
+	}
+	ch := make(chan error, 1)
+	ch <- err
+	return ch
+}
+
+// invoke the f on action a. returns an illegal state error if f is nil.
+func (f DoerFunc) Do(a Action) <-chan error {
+	if f != nil {
+		return f(a)
+	}
+	return ErrorChan(errIllegalState)
+}
diff --git a/contrib/mesos/pkg/proc/proc_test.go b/contrib/mesos/pkg/proc/proc_test.go
new file mode 100644
index 00000000000..31c034465f6
--- /dev/null
+++ b/contrib/mesos/pkg/proc/proc_test.go
@@ -0,0 +1,373 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package proc
+
+import (
+	"fmt"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
+	log "github.com/golang/glog"
+)
+
+// logs a testing.Fatalf if the elapsed time d passes before signal chan done is closed
+func fatalAfter(t *testing.T, done <-chan struct{}, d time.Duration, msg string, args ...interface{}) {
+	select {
+	case <-done:
+	case <-time.After(d):
+		t.Fatalf(msg, args...)
+	}
+}
+
+func errorAfter(errOnce ErrorOnce, done <-chan struct{}, d time.Duration, msg string, args ...interface{}) {
+	select {
+	case <-done:
+	case <-time.After(d):
+		errOnce.Reportf(msg, args...)
+	}
+}
+
+// logs a testing.Fatalf if the signal chan closes before the elapsed time d passes
+func fatalOn(t *testing.T, done <-chan struct{}, d time.Duration, msg string, args ...interface{}) {
+	select {
+	case <-done:
+		t.Fatalf(msg, args...)
+	case <-time.After(d):
+	}
+}
+
+func TestProc_manyEndings(t *testing.T) {
+	p := New()
+	const COUNT = 20
+	var wg sync.WaitGroup
+	wg.Add(COUNT)
+	for i := 0; i < COUNT; i++ {
+		runtime.On(p.End(), wg.Done)
+	}
+	fatalAfter(t, runtime.After(wg.Wait), 5*time.Second, "timed out waiting for loose End()s")
+	fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
+}
+
+func TestProc_singleAction(t *testing.T) {
+	p := New()
+	scheduled := make(chan struct{})
+	called := make(chan struct{})
+
+	go func() {
+		log.Infof("do'ing deferred action")
+		defer close(scheduled)
+		err := p.Do(func() {
+			defer close(called)
+			log.Infof("deferred action invoked")
+		})
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+	}()
+
+	fatalAfter(t, scheduled, 5*time.Second, "timed out waiting for deferred action to be scheduled")
+	fatalAfter(t, called, 5*time.Second, "timed out waiting for deferred action to be invoked")
+
+	p.End()
+
+	fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
+}
+
+func TestProc_singleActionEnd(t *testing.T) {
+	p := New()
+	scheduled := make(chan struct{})
+	called := make(chan struct{})
+
+	go func() {
+		log.Infof("do'ing deferred action")
+		defer close(scheduled)
+		err := p.Do(func() {
+			defer close(called)
+			log.Infof("deferred action invoked")
+			p.End()
+		})
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+	}()
+
+	fatalAfter(t, scheduled, 5*time.Second, "timed out waiting for deferred action to be scheduled")
+	fatalAfter(t, called, 5*time.Second, "timed out waiting for deferred action to be invoked")
+	fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
+}
+
+func TestProc_multiAction(t *testing.T) {
+	p := New()
+	const COUNT = 10
+	var called sync.WaitGroup
+	called.Add(COUNT)
+
+	// test FIFO property
+	next := 0
+	for i := 0; i < COUNT; i++ {
+		log.Infof("do'ing deferred action %d", i)
+		idx := i
+		err := p.Do(func() {
+			defer called.Done()
+			log.Infof("deferred action invoked")
+			if next != idx {
+				t.Fatalf("expected index %d instead of %d", idx, next)
+			}
+			next++
+		})
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+	}
+
+	fatalAfter(t, runtime.After(called.Wait), 2*time.Second, "timed out waiting for deferred actions to be invoked")
+
+	p.End()
+
+	fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
+}
+
+func TestProc_goodLifecycle(t *testing.T) {
+	p := New()
+	p.End()
+	fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
+}
+
+func TestProc_doWithDeadProc(t *testing.T) {
+	p := New()
+	p.End()
+	time.Sleep(100 * time.Millisecond)
+
+	errUnexpected := fmt.Errorf("unexpected execution of delegated action")
+	decorated := DoWith(p, DoerFunc(func(_ Action) <-chan error {
+		return ErrorChan(errUnexpected)
+	}))
+
+	decorated.Do(func() {})
+	fatalAfter(t, decorated.Done(), 5*time.Second, "timed out waiting for process death")
+}
+
+func TestProc_doWith(t *testing.T) {
+	p := New()
+
+	delegated := false
+	decorated := DoWith(p, DoerFunc(func(a Action) <-chan error {
+		delegated = true
+		a()
+		return nil
+	}))
+
+	executed := make(chan struct{})
+	err := decorated.Do(func() {
+		defer close(executed)
+		if !delegated {
+			t.Fatalf("expected delegated execution")
+		}
+	})
+	if err == nil {
+		t.Fatalf("expected !nil error chan")
+	}
+
+	fatalAfter(t, executed, 5*time.Second, "timed out waiting deferred execution")
+	fatalAfter(t, decorated.OnError(err, func(e error) {
+		t.Fatalf("unexpected error: %v", e)
+	}), 1*time.Second, "timed out waiting for doer result")
+
+	decorated.End()
+	fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
+}
+
+func TestProc_doWithNestedTwice(t *testing.T) {
+	p := New()
+
+	delegated := false
+	decorated := DoWith(p, DoerFunc(func(a Action) <-chan error {
+		a()
+		return nil
+	}))
+
+	decorated2 := DoWith(decorated, DoerFunc(func(a Action) <-chan error {
+		delegated = true
+		a()
+		return nil
+	}))
+
+	executed := make(chan struct{})
+	err := decorated2.Do(func() {
+		defer close(executed)
+		if !delegated {
+			t.Fatalf("expected delegated execution")
+		}
+	})
+	if err == nil {
+		t.Fatalf("expected !nil error chan")
+	}
+
+	fatalAfter(t, executed, 5*time.Second, "timed out waiting deferred execution")
+	fatalAfter(t, decorated2.OnError(err, func(e error) {
+		t.Fatalf("unexpected error: %v", e)
+	}), 1*time.Second, "timed out waiting for doer result")
+
+	decorated2.End()
+	fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
+}
+
+func TestProc_doWithNestedErrorPropagation(t *testing.T) {
+	p := New()
+
+	delegated := false
+	decorated := DoWith(p, DoerFunc(func(a Action) <-chan error {
+		a()
+		return nil
+	}))
+
+	expectedErr := fmt.Errorf("expecting this")
+	errOnce := NewErrorOnce(p.Done())
+	decorated2 := DoWith(decorated, DoerFunc(func(a Action) <-chan error {
+		delegated = true
+		a()
+		errOnce.Reportf("unexpected error in decorator2")
+		return ErrorChanf("another unexpected error in decorator2")
+	}))
+
+	executed := make(chan struct{})
+	err := decorated2.Do(func() {
+		defer close(executed)
+		if !delegated {
+			t.Fatalf("expected delegated execution")
+		}
+		errOnce.Report(expectedErr)
+	})
+	if err == nil {
+		t.Fatalf("expected !nil error chan")
+	}
+	errOnce.Send(err)
+
+	foundError := false
+	fatalAfter(t, executed, 1*time.Second, "timed out waiting deferred execution")
+	fatalAfter(t, decorated2.OnError(errOnce.Err(), func(e error) {
+		if e != expectedErr {
+			t.Fatalf("unexpected error: %v", e)
+		} else {
+			foundError = true
+		}
+	}), 1*time.Second, "timed out waiting for doer result")
+
+	if !foundError {
+		t.Fatalf("expected a propagated error")
+	}
+
+	decorated2.End()
+	fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
+}
+
+func runDelegationTest(t *testing.T, p Process, name string, errOnce ErrorOnce) {
+	defer func() {
+		t.Logf("runDelegationTest finished at " + time.Now().String())
+	}()
+	var decorated Process
+	decorated = p
+
+	const DEPTH = 100
+	var wg sync.WaitGroup
+	wg.Add(DEPTH)
+	y := 0
+
+	for x := 1; x <= DEPTH; x++ {
+		x := x
+		nextp := DoWith(decorated, DoerFunc(func(a Action) <-chan error {
+			if x == 1 {
+				t.Logf("delegate chain invoked for " + name)
+			}
+			y++
+			if y != x {
+				return ErrorChanf("out of order delegated execution")
+			}
+			defer wg.Done()
+			a()
+			return nil
+		}))
+		decorated = nextp
+	}
+
+	executed := make(chan struct{})
+	errCh := decorated.Do(func() {
+		defer close(executed)
+		if y != DEPTH {
+			errOnce.Reportf("expected delegated execution")
+		}
+		t.Logf("executing deferred action: " + name + " at " + time.Now().String())
+		errOnce.Send(nil) // we completed without error, let the listener know
+	})
+	if errCh == nil {
+		t.Fatalf("expected !nil error chan")
+	}
+
+	// forward any scheduling errors to the listener; NOTHING else should attempt to read
+	// from errCh after this point
+	errOnce.Send(errCh)
+
+	errorAfter(errOnce, executed, 5*time.Second, "timed out waiting deferred execution")
+	t.Logf("runDelegationTest received executed signal
at " + time.Now().String()) +} + +func TestProc_doWithNestedX(t *testing.T) { + t.Logf("starting test case at " + time.Now().String()) + p := New() + errOnce := NewErrorOnce(p.Done()) + runDelegationTest(t, p, "nested", errOnce) + <-p.End() + select { + case err := <-errOnce.Err(): + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + case <-time.After(5 * time.Second): + t.Fatalf("timed out waiting for doer result") + } + fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death") +} + +// intended to be run with -race +func TestProc_doWithNestedXConcurrent(t *testing.T) { + p := New() + errOnce := NewErrorOnce(p.Done()) + var wg sync.WaitGroup + const CONC = 20 + wg.Add(CONC) + for i := 0; i < CONC; i++ { + i := i + runtime.After(func() { runDelegationTest(t, p, fmt.Sprintf("nested%d", i), errOnce) }).Then(wg.Done) + } + ch := runtime.After(wg.Wait) + fatalAfter(t, ch, 10*time.Second, "timed out waiting for concurrent delegates") + + <-p.End() + + select { + case err := <-errOnce.Err(): + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + case <-time.After(5 * time.Second): + t.Fatalf("timed out waiting for doer result") + } + + fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death") +} diff --git a/contrib/mesos/pkg/proc/state.go b/contrib/mesos/pkg/proc/state.go new file mode 100644 index 00000000000..f35a2ea8382 --- /dev/null +++ b/contrib/mesos/pkg/proc/state.go @@ -0,0 +1,55 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package proc + +import ( + "sync/atomic" +) + +type stateType int32 + +const ( + stateNew stateType = iota + stateRunning + stateTerminal +) + +func (s *stateType) get() stateType { + return stateType(atomic.LoadInt32((*int32)(s))) +} + +func (s *stateType) transition(from, to stateType) bool { + return atomic.CompareAndSwapInt32((*int32)(s), int32(from), int32(to)) +} + +func (s *stateType) transitionTo(to stateType, unless ...stateType) bool { + if len(unless) == 0 { + atomic.StoreInt32((*int32)(s), int32(to)) + return true + } + for { + state := s.get() + for _, x := range unless { + if state == x { + return false + } + } + if s.transition(state, to) { + return true + } + } +} diff --git a/contrib/mesos/pkg/proc/types.go b/contrib/mesos/pkg/proc/types.go new file mode 100644 index 00000000000..d2cae458b15 --- /dev/null +++ b/contrib/mesos/pkg/proc/types.go @@ -0,0 +1,71 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package proc + +// something that executes in the context of a process +type Action func() + +type Context interface { + // end (terminate) the execution context + End() <-chan struct{} + + // return a signal chan that will close upon the termination of this process + Done() <-chan struct{} +} + +type Doer interface { + // execute some action in some context. actions are to be executed in a + // concurrency-safe manner: no two actions should execute at the same time. + // errors are generated if the action cannot be executed (not by the execution + // of the action) and should be testable with the error API of this package, + // for example, IsProcessTerminated. + Do(Action) <-chan error +} + +// adapter func for Doer interface +type DoerFunc func(Action) <-chan error + +type Process interface { + Context + Doer + + // see top level OnError func. this implementation will terminate upon the arrival of + // an error (and subsequently invoke the error handler, if given) or else the termination + // of the process (testable via IsProcessTerminated). + OnError(<-chan error, func(error)) <-chan struct{} + + // return a signal chan that will close once the process is ready to run actions + Running() <-chan struct{} +} + +// this is an error promise. if we ever start building out support for other promise types it will probably +// make sense to group them in some sort of "promises" package. +type ErrorOnce interface { + // return a chan that only ever sends one error, either obtained via Report() or Forward() + Err() <-chan error + + // reports the given error via Err(), but only if no other errors have been reported or forwarded + Report(error) + Reportf(string, ...interface{}) + + // waits for an error on the incoming chan, the result of which is later obtained via Err() (if no + // other errors have been reported or forwarded) + forward(<-chan error) + + // non-blocking, spins up a goroutine that reports an error (if any) that occurs on the error chan. + Send(<-chan error) ErrorOnce +} diff --git a/contrib/mesos/pkg/profile/doc.go b/contrib/mesos/pkg/profile/doc.go new file mode 100644 index 00000000000..041a3c914d8 --- /dev/null +++ b/contrib/mesos/pkg/profile/doc.go @@ -0,0 +1,18 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package profile contains reusable code for profiling Go programs with pprof. +package profile diff --git a/contrib/mesos/pkg/profile/profile.go b/contrib/mesos/pkg/profile/profile.go new file mode 100644 index 00000000000..a24fe8a07ae --- /dev/null +++ b/contrib/mesos/pkg/profile/profile.go @@ -0,0 +1,27 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package profile + +import "net/http" +import "net/http/pprof" + +func InstallHandler(m *http.ServeMux) { + // register similar endpoints as net/http/pprof.init() does + m.Handle("/debug/pprof/", http.HandlerFunc(pprof.Index)) + m.Handle("/debug/pprof/profile", http.HandlerFunc(pprof.Profile)) + m.Handle("/debug/pprof/symbol", http.HandlerFunc(pprof.Symbol)) +} diff --git a/contrib/mesos/pkg/queue/delay.go b/contrib/mesos/pkg/queue/delay.go new file mode 100644 index 00000000000..39e93281cd6 --- /dev/null +++ b/contrib/mesos/pkg/queue/delay.go @@ -0,0 +1,373 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package queue + +import ( + "container/heap" + "sync" + "time" + + "github.com/GoogleCloudPlatform/kubernetes/pkg/util" +) + +type qitem struct { + value interface{} + priority Priority + index int + readd func(item *qitem) // re-add the value of the item to the queue +} + +// A priorityQueue implements heap.Interface and holds qitems. +type priorityQueue []*qitem + +func (pq priorityQueue) Len() int { return len(pq) } + +func (pq priorityQueue) Less(i, j int) bool { + return pq[i].priority.ts.Before(pq[j].priority.ts) +} + +func (pq priorityQueue) Swap(i, j int) { + pq[i], pq[j] = pq[j], pq[i] + pq[i].index = i + pq[j].index = j +} + +func (pq *priorityQueue) Push(x interface{}) { + n := len(*pq) + item := x.(*qitem) + item.index = n + *pq = append(*pq, item) +} + +func (pq *priorityQueue) Pop() interface{} { + old := *pq + n := len(old) + item := old[n-1] + item.index = -1 // for safety + *pq = old[0 : n-1] + return item +} + +// concurrency-safe, deadline-oriented queue that returns items after their +// delay period has expired. +type DelayQueue struct { + queue priorityQueue + lock sync.RWMutex + cond sync.Cond +} + +func NewDelayQueue() *DelayQueue { + q := &DelayQueue{} + q.cond.L = &q.lock + return q +} + +func (q *DelayQueue) Add(d Delayed) { + deadline := extractFromDelayed(d) + + q.lock.Lock() + defer q.lock.Unlock() + + // readd using the original deadline computed from the original delay + var readd func(*qitem) + readd = func(qp *qitem) { + q.lock.Lock() + defer q.lock.Unlock() + heap.Push(&q.queue, &qitem{ + value: d, + priority: deadline, + readd: readd, + }) + q.cond.Broadcast() + } + heap.Push(&q.queue, &qitem{ + value: d, + priority: deadline, + readd: readd, + }) + q.cond.Broadcast() +} + +// If there's a deadline reported by d.Deadline() then `d` is added to the +// queue and this func returns true. 
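+//
+// A minimal illustrative sketch (not part of the original patch; "job" is a
+// hypothetical type whose Deadline() (time.Time, bool) method reports a due
+// time):
+//
+//	q := NewDelayQueue()
+//	if q.Offer(&job{due: time.Now().Add(time.Second)}) {
+//		item := q.Pop() // blocks until the deadline has passed
+//		_ = item
+//	}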
+func (q *DelayQueue) Offer(d Deadlined) bool { + deadline, ok := extractFromDeadlined(d) + if ok { + q.lock.Lock() + defer q.lock.Unlock() + heap.Push(&q.queue, &qitem{ + value: d, + priority: deadline, + readd: func(qp *qitem) { + q.Offer(qp.value.(Deadlined)) + }, + }) + q.cond.Broadcast() + } + return ok +} + +// wait for the delay of the next item in the queue to expire, blocking if +// there are no items in the queue. does not guarantee first-come-first-serve +// ordering with respect to clients. +func (q *DelayQueue) Pop() interface{} { + // doesn't implement cancellation, will always return a non-nil value + return q.pop(func() *qitem { + q.lock.Lock() + defer q.lock.Unlock() + for q.queue.Len() == 0 { + q.cond.Wait() + } + x := heap.Pop(&q.queue) + item := x.(*qitem) + return item + }, nil) +} + +// returns a non-nil value from the queue, or else nil if/when cancelled; if cancel +// is nil then cancellation is disabled and this func must return a non-nil value. +func (q *DelayQueue) pop(next func() *qitem, cancel <-chan struct{}) interface{} { + var ch chan struct{} + for { + item := next() + if item == nil { + // cancelled + return nil + } + x := item.value + waitingPeriod := item.priority.ts.Sub(time.Now()) + if waitingPeriod >= 0 { + // listen for calls to Add() while we're waiting for the deadline + if ch == nil { + ch = make(chan struct{}, 1) + } + go func() { + q.lock.Lock() + defer q.lock.Unlock() + q.cond.Wait() + ch <- struct{}{} + }() + select { + case <-cancel: + item.readd(item) + return nil + case <-ch: + // we may no longer have the earliest deadline, re-try + item.readd(item) + continue + case <-time.After(waitingPeriod): + // noop + case <-item.priority.notify: + // noop + } + } + return x + } +} + +// If multiple adds/updates of a single item happen while an item is in the +// queue before it has been processed, it will only be processed once, and +// when it is processed, the most recent version will be processed. Items are +// popped in order of their priority, currently controlled by a delay or +// deadline assigned to each item in the queue. +type DelayFIFO struct { + // internal deadline-based priority queue + delegate *DelayQueue + // We depend on the property that items in the set are in the queue and vice versa. + items map[string]*qitem + deadlinePolicy DeadlinePolicy +} + +func (q *DelayFIFO) lock() { + q.delegate.lock.Lock() +} + +func (q *DelayFIFO) unlock() { + q.delegate.lock.Unlock() +} + +func (q *DelayFIFO) rlock() { + q.delegate.lock.RLock() +} + +func (q *DelayFIFO) runlock() { + q.delegate.lock.RUnlock() +} + +func (q *DelayFIFO) queue() *priorityQueue { + return &q.delegate.queue +} + +func (q *DelayFIFO) cond() *sync.Cond { + return &q.delegate.cond +} + +// Add inserts an item, and puts it in the queue. The item is only enqueued +// if it doesn't already exist in the set. 
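+//
+// A minimal usage sketch (illustrative only; jobA stands for any hypothetical
+// value implementing UniqueDelayed):
+//
+//	df := NewDelayFIFO()
+//	df.Add(jobA, ReplaceExisting) // enqueue jobA
+//	df.Add(jobA, KeepExisting)    // already queued: keep the value, deadline per DeadlinePolicy
+//	x := df.Pop()                 // returns jobA once its delay has elapsed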
+func (q *DelayFIFO) Add(d UniqueDelayed, rp ReplacementPolicy) {
+	deadline := extractFromDelayed(d)
+	id := d.GetUID()
+	var adder func(*qitem)
+	adder = func(*qitem) {
+		q.add(id, deadline, d, KeepExisting, adder)
+	}
+	q.add(id, deadline, d, rp, adder)
+}
+
+func (q *DelayFIFO) Offer(d UniqueDeadlined, rp ReplacementPolicy) bool {
+	if deadline, ok := extractFromDeadlined(d); ok {
+		id := d.GetUID()
+		q.add(id, deadline, d, rp, func(qp *qitem) { q.Offer(qp.value.(UniqueDeadlined), KeepExisting) })
+		return true
+	}
+	return false
+}
+
+func (q *DelayFIFO) add(id string, deadline Priority, value interface{}, rp ReplacementPolicy, adder func(*qitem)) {
+	q.lock()
+	defer q.unlock()
+	if item, exists := q.items[id]; !exists {
+		item = &qitem{
+			value:    value,
+			priority: deadline,
+			readd:    adder,
+		}
+		heap.Push(q.queue(), item)
+		q.items[id] = item
+	} else {
+		// this is an update of an existing item
+		item.value = rp.replacementValue(item.value, value)
+		item.priority = q.deadlinePolicy.nextDeadline(item.priority, deadline)
+		heap.Fix(q.queue(), item.index)
+	}
+	q.cond().Broadcast()
+}
+
+// Delete removes an item. It doesn't add it to the queue, because
+// this implementation assumes the consumer only cares about the objects,
+// not their priority order.
+func (f *DelayFIFO) Delete(id string) {
+	f.lock()
+	defer f.unlock()
+	delete(f.items, id)
+}
+
+// List returns a list of all the items.
+func (f *DelayFIFO) List() []UniqueID {
+	f.rlock()
+	defer f.runlock()
+	list := make([]UniqueID, 0, len(f.items))
+	for _, item := range f.items {
+		// assert UniqueID (not UniqueDelayed): Offer()ed items are only
+		// guaranteed to implement UniqueDeadlined
+		list = append(list, item.value.(UniqueID))
+	}
+	return list
+}
+
+// ContainedIDs returns a util.StringSet containing all IDs of the stored items.
+// This is a snapshot of a moment in time, and one should keep in mind that
+// other go routines can add or remove items after you call this.
+func (c *DelayFIFO) ContainedIDs() util.StringSet {
+	c.rlock()
+	defer c.runlock()
+	set := util.StringSet{}
+	for id := range c.items {
+		set.Insert(id)
+	}
+	return set
+}
+
+// Get returns the requested item, or sets exists=false.
+func (f *DelayFIFO) Get(id string) (UniqueID, bool) {
+	f.rlock()
+	defer f.runlock()
+	if item, exists := f.items[id]; exists {
+		return item.value.(UniqueID), true
+	}
+	return nil, false
+}
+
+// Await is a variant of DelayQueue.Pop() with a timeout: it returns the next
+// UniqueID, or else nil if the timeout expires first.
+func (q *DelayFIFO) Await(timeout time.Duration) UniqueID {
+	cancel := make(chan struct{})
+	ch := make(chan interface{}, 1)
+	go func() { ch <- q.pop(cancel) }()
+	var x interface{}
+	select {
+	case <-time.After(timeout):
+		close(cancel)
+		x = <-ch
+	case x = <-ch:
+		// noop
+	}
+	if x != nil {
+		return x.(UniqueID)
+	}
+	return nil
+}
+
+// Pop is a variant of DelayQueue.Pop() that blocks until an item is ready and
+// returns it as a UniqueID.
+func (q *DelayFIFO) Pop() UniqueID {
+	return q.pop(nil).(UniqueID)
+}
+
+// variant of DelayQueue.pop that implements optional cancellation
+func (q *DelayFIFO) pop(cancel chan struct{}) interface{} {
+	next := func() *qitem {
+		q.lock()
+		defer q.unlock()
+		for {
+			for q.queue().Len() == 0 {
+				signal := make(chan struct{})
+				go func() {
+					defer close(signal)
+					q.cond().Wait()
+				}()
+				select {
+				case <-cancel:
+					// we may not have the lock yet, so
+					// broadcast to abort Wait, then
+					// return after lock re-acquisition
+					q.cond().Broadcast()
+					<-signal
+					return nil
+				case <-signal:
+					// we have the lock, re-check
+					// the queue for data...
+				}
+			}
+			x := heap.Pop(q.queue())
+			item := x.(*qitem)
+			unique := item.value.(UniqueID)
+			uid := unique.GetUID()
+			if _, ok := q.items[uid]; !ok {
+				// item was deleted, keep looking
+				continue
+			}
+			delete(q.items, uid)
+			return item
+		}
+	}
+	return q.delegate.pop(next, cancel)
+}
+
+func NewDelayFIFO() *DelayFIFO {
+	f := &DelayFIFO{
+		delegate: NewDelayQueue(),
+		items:    map[string]*qitem{},
+	}
+	return f
+}
diff --git a/contrib/mesos/pkg/queue/delay_test.go b/contrib/mesos/pkg/queue/delay_test.go
new file mode 100644
index 00000000000..df0ea940a07
--- /dev/null
+++ b/contrib/mesos/pkg/queue/delay_test.go
@@ -0,0 +1,406 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package queue
+
+import (
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+)
+
+const (
+	tolerance = 100 * time.Millisecond // go time delays aren't perfect, this is our tolerance for errors WRT expected timeouts
+)
+
+func timedPriority(t time.Time) Priority {
+	return Priority{ts: t}
+}
+
+func TestPQ(t *testing.T) {
+	t.Parallel()
+
+	var pq priorityQueue
+	if pq.Len() != 0 {
+		t.Fatalf("pq should be empty")
+	}
+
+	now := timedPriority(time.Now())
+	now2 := timedPriority(now.ts.Add(2 * time.Second))
+	pq.Push(&qitem{priority: now2})
+	if pq.Len() != 1 {
+		t.Fatalf("pq.len should be 1")
+	}
+	x := pq.Pop()
+	if x == nil {
+		t.Fatalf("x is nil")
+	}
+	if pq.Len() != 0 {
+		t.Fatalf("pq should be empty")
+	}
+	item := x.(*qitem)
+	if !item.priority.Equal(now2) {
+		t.Fatalf("item.priority != now2")
+	}
+
+	pq.Push(&qitem{priority: now2})
+	pq.Push(&qitem{priority: now2})
+	pq.Push(&qitem{priority: now2})
+	pq.Push(&qitem{priority: now2})
+	pq.Push(&qitem{priority: now2})
+	pq.Pop()
+	pq.Pop()
+	pq.Pop()
+	pq.Pop()
+	pq.Pop()
+	if pq.Len() != 0 {
+		t.Fatalf("pq should be empty")
+	}
+	now4 := timedPriority(now.ts.Add(4 * time.Second))
+	now6 := timedPriority(now.ts.Add(6 * time.Second))
+	pq.Push(&qitem{priority: now2})
+	pq.Push(&qitem{priority: now4})
+	pq.Push(&qitem{priority: now6})
+	pq.Swap(0, 2)
+	if !pq[0].priority.Equal(now6) || !pq[2].priority.Equal(now2) {
+		t.Fatalf("swap failed")
+	}
+	if pq.Less(1, 2) {
+		t.Fatalf("now4 < now2")
+	}
+}
+
+func TestPopEmptyPQ(t *testing.T) {
+	t.Parallel()
+	defer func() {
+		if r := recover(); r == nil {
+			t.Fatalf("Expected panic from popping an empty PQ")
+		}
+	}()
+	var pq priorityQueue
+	pq.Pop()
+}
+
+type testjob struct {
+	d        time.Duration
+	t        time.Time
+	deadline *time.Time
+	uid      string
+	instance int
+}
+
+func (j *testjob) GetDelay() time.Duration {
+	return j.d
+}
+
+func (j testjob) GetUID() string {
+	return j.uid
+}
+
+func (td *testjob) Deadline() (deadline time.Time, ok bool) {
+	if td.deadline != nil {
+		return *td.deadline, true
+	} else {
+		return time.Now(), false
+	}
+}
+
+func TestDQ_sanity_check(t *testing.T) {
+	t.Parallel()
+
+	dq := NewDelayQueue()
+	delay := 2 * time.Second
+	dq.Add(&testjob{d: delay})
+
+	before := time.Now()
+	x := dq.Pop()
+
+	now := time.Now()
+	waitPeriod := now.Sub(before)
+
+	if waitPeriod+tolerance < delay {
+		t.Fatalf("delay too short: %v, expected: %v", waitPeriod, delay)
+	}
+	if x == nil {
+		t.Fatalf("x is nil")
+	}
+	item := x.(*testjob)
+	if item.d != delay {
+		t.Fatalf("d != delay")
+	}
+}
+
+func TestDQ_Offer(t *testing.T) {
+	t.Parallel()
+	assert := assert.New(t)
+
+	dq := NewDelayQueue()
+	delay := time.Second
+
+	added := dq.Offer(&testjob{})
+	if added {
+		t.Fatalf("DelayQueue should not add offered job without deadline")
+	}
+
+	deadline := time.Now().Add(delay)
+	added = dq.Offer(&testjob{deadline: &deadline})
+	if !added {
+		t.Fatalf("DelayQueue should add offered job with deadline")
+	}
+
+	before := time.Now()
+	x := dq.Pop()
+
+	now := time.Now()
+	waitPeriod := now.Sub(before)
+
+	if waitPeriod+tolerance < delay {
+		t.Fatalf("delay too short: %v, expected: %v", waitPeriod, delay)
+	}
+	assert.NotNil(x)
+	assert.Equal(x.(*testjob).deadline, &deadline)
+}
+
+func TestDQ_ordered_add_pop(t *testing.T) {
+	t.Parallel()
+
+	dq := NewDelayQueue()
+	dq.Add(&testjob{d: 2 * time.Second})
+	dq.Add(&testjob{d: 1 * time.Second})
+	dq.Add(&testjob{d: 3 * time.Second})
+
+	var finished [3]*testjob
+	before := time.Now()
+	idx := int32(-1)
+	ch := make(chan bool, 3)
+	//TODO: replace with `for range finished` once Go 1.3 support is dropped
+	for n := 0; n < len(finished); n++ {
+		go func() {
+			var ok bool
+			x := dq.Pop()
+			i := atomic.AddInt32(&idx, 1)
+			if finished[i], ok = x.(*testjob); !ok {
+				t.Fatalf("expected a *testjob, not %v", x)
+			}
+			finished[i].t = time.Now()
+			ch <- true
+		}()
+	}
+	<-ch
+	<-ch
+	<-ch
+
+	after := time.Now()
+	totalDelay := after.Sub(before)
+	if totalDelay+tolerance < (3 * time.Second) {
+		t.Fatalf("totalDelay < 3s: %v", totalDelay)
+	}
+	for i, v := range finished {
+		if v == nil {
+			t.Fatalf("task %d was nil", i)
+		}
+		expected := time.Duration(i+1) * time.Second
+		if v.d != expected {
+			t.Fatalf("task %d had delay-priority %v, expected %v", i, v.d, expected)
+		}
+		actualDelay := v.t.Sub(before)
+		if actualDelay+tolerance < v.d {
+			t.Fatalf("task %d had actual-delay %v < expected delay %v", i, actualDelay, v.d)
+		}
+	}
+}
+
+func TestDQ_always_pop_earliest_deadline(t *testing.T) {
+	t.Parallel()
+
+	// add a testjob with delay of 2s
+	// spawn a func f1 that attempts to Pop() and wait for f1 to begin
+	// add a testjob with a delay of 1s
+	// check that the func f1 actually popped the 1s task (not the 2s task)
+
+	dq := NewDelayQueue()
+	dq.Add(&testjob{d: 2 * time.Second})
+	ch := make(chan *testjob)
+	started := make(chan bool)
+
+	go func() {
+		started <- true
+		x := dq.Pop()
+		job := x.(*testjob)
+		job.t = time.Now()
+		ch <- job
+	}()
+
+	<-started
+	time.Sleep(500 * time.Millisecond) // give plenty of time for Pop() to enter
+	expected := 1 * time.Second
+	dq.Add(&testjob{d: expected})
+	job := <-ch
+
+	if expected != job.d {
+		t.Fatalf("Expected delay-priority of %v, instead got %v", expected, job.d)
+	}
+
+	job = dq.Pop().(*testjob)
+	expected = 2 * time.Second
+	if expected != job.d {
+		t.Fatalf("Expected delay-priority of %v, instead got %v", expected, job.d)
+	}
+}
+
+func TestDQ_always_pop_earliest_deadline_multi(t *testing.T) {
+	t.Parallel()
+
+	dq := NewDelayQueue()
+	dq.Add(&testjob{d: 2 * time.Second})
+
+	ch := make(chan *testjob)
+	multi := 10
+	started := make(chan bool, multi)
+
+	go func() {
+		started <- true
+		for i := 0; i < multi; i++ {
+			x := dq.Pop()
+			job := x.(*testjob)
+			job.t = time.Now()
+			ch <- job
+		}
+	}()
+
+	<-started
+	time.Sleep(500 * time.Millisecond) // give plenty of time for Pop() to enter
+	expected := 1 * time.Second
+
+	for i := 0; i < multi; i++ {
+		dq.Add(&testjob{d: expected})
+	}
+	for i := 0; i < multi; i++ {
+		job := <-ch
+		if expected != job.d {
+			t.Fatalf("Expected delay-priority of %v, instead got %v", expected, job.d)
+		}
+	}
+
+	job := dq.Pop().(*testjob)
+	expected = 2 * time.Second
+	if expected != job.d {
+		t.Fatalf("Expected delay-priority of %v, instead got %v", expected, job.d)
+	}
+}
+
+func TestDQ_negative_delay(t *testing.T) {
+	t.Parallel()
+
+	dq := NewDelayQueue()
+	delay := -2 * time.Second
+	dq.Add(&testjob{d: delay})
+
+	before := time.Now()
+	x := dq.Pop()
+
+	now := time.Now()
+	waitPeriod := now.Sub(before)
+
+	if waitPeriod > tolerance {
+		t.Fatalf("delay too long: %v, expected something less than: %v", waitPeriod, tolerance)
+	}
+	if x == nil {
+		t.Fatalf("x is nil")
+	}
+	item := x.(*testjob)
+	if item.d != delay {
+		t.Fatalf("d != delay")
+	}
+}
+
+func TestDFIFO_sanity_check(t *testing.T) {
+	t.Parallel()
+	assert := assert.New(t)
+
+	df := NewDelayFIFO()
+	delay := 2 * time.Second
+	df.Add(&testjob{d: delay, uid: "a", instance: 1}, ReplaceExisting)
+	assert.True(df.ContainedIDs().Has("a"))
+
+	// re-add by ReplaceExisting
+	df.Add(&testjob{d: delay, uid: "a", instance: 2}, ReplaceExisting)
+	assert.True(df.ContainedIDs().Has("a"))
+
+	a, ok := df.Get("a")
+	assert.True(ok)
+	assert.Equal(a.(*testjob).instance, 2)
+
+	// re-add by KeepExisting
+	df.Add(&testjob{d: delay, uid: "a", instance: 3}, KeepExisting)
+	assert.True(df.ContainedIDs().Has("a"))
+
+	a, ok = df.Get("a")
+	assert.True(ok)
+	assert.Equal(a.(*testjob).instance, 2)
+
+	// pop last
+	before := time.Now()
+	x := df.Pop()
+	assert.Equal(x.(*testjob).instance, 2)
+
+	now := time.Now()
+	waitPeriod := now.Sub(before)
+
+	if waitPeriod+tolerance < delay {
+		t.Fatalf("delay too short: %v, expected: %v", waitPeriod, delay)
+	}
+	if x == nil {
+		t.Fatalf("x is nil")
+	}
+	item := x.(*testjob)
+	if item.d != delay {
+		t.Fatalf("d != delay")
+	}
+}
+
+func TestDFIFO_Offer(t *testing.T) {
+	t.Parallel()
+	assert := assert.New(t)
+
+	dq := NewDelayFIFO()
+	delay := time.Second
+
+	added := dq.Offer(&testjob{instance: 1}, ReplaceExisting)
+	if added {
+		t.Fatalf("DelayFIFO should not add offered job without deadline")
+	}
+
+	deadline := time.Now().Add(delay)
+	added = dq.Offer(&testjob{deadline: &deadline, instance: 2}, ReplaceExisting)
+	if !added {
+		t.Fatalf("DelayFIFO should add offered job with deadline")
+	}
+
+	before := time.Now()
+	x := dq.Pop()
+
+	now := time.Now()
+	waitPeriod := now.Sub(before)
+
+	if waitPeriod+tolerance < delay {
+		t.Fatalf("delay too short: %v, expected: %v", waitPeriod, delay)
+	}
+	assert.NotNil(x)
+	assert.Equal(x.(*testjob).instance, 2)
+}
diff --git a/contrib/mesos/pkg/queue/doc.go b/contrib/mesos/pkg/queue/doc.go
new file mode 100644
index 00000000000..c35bd971bc7
--- /dev/null
+++ b/contrib/mesos/pkg/queue/doc.go
@@ -0,0 +1,19 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package queue provides several queue implementations, originally +// inspired by Kubernetes pkg/client/cache/fifo. +package queue diff --git a/contrib/mesos/pkg/queue/historical.go b/contrib/mesos/pkg/queue/historical.go new file mode 100644 index 00000000000..09148acdb83 --- /dev/null +++ b/contrib/mesos/pkg/queue/historical.go @@ -0,0 +1,403 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package queue + +import ( + "fmt" + "reflect" + "sync" + "time" + + "github.com/GoogleCloudPlatform/kubernetes/pkg/util" +) + +type entry struct { + value UniqueCopyable + event EventType +} + +type deletedEntry struct { + *entry + expiration time.Time +} + +func (e *entry) Value() UniqueCopyable { + return e.value +} + +func (e *entry) Copy() Copyable { + if e == nil { + return nil + } + return &entry{e.value.Copy().(UniqueCopyable), e.event} +} + +func (e *entry) Is(types EventType) bool { + return types&e.event != 0 +} + +func (e *deletedEntry) Copy() Copyable { + if e == nil { + return nil + } + return &deletedEntry{e.entry.Copy().(*entry), e.expiration} +} + +// deliver a message +type pigeon func(msg Entry) + +func dead(msg Entry) { + // intentionally blank +} + +// HistoricalFIFO receives adds and updates from a Reflector, and puts them in a queue for +// FIFO order processing. If multiple adds/updates of a single item happen while +// an item is in the queue before it has been processed, it will only be +// processed once, and when it is processed, the most recent version will be +// processed. This can't be done with a channel. +type HistoricalFIFO struct { + lock sync.RWMutex + cond sync.Cond + items map[string]Entry // We depend on the property that items in the queue are in the set. + queue []string + carrier pigeon // may be dead, but never nil + gcc int + lingerTTL time.Duration +} + +// panics if obj doesn't implement UniqueCopyable; otherwise returns the same, typecast object +func checkType(obj interface{}) UniqueCopyable { + if v, ok := obj.(UniqueCopyable); !ok { + panic(fmt.Sprintf("Illegal object type, expected UniqueCopyable: %T", obj)) + } else { + return v + } +} + +// Add inserts an item, and puts it in the queue. The item is only enqueued +// if it doesn't already exist in the set. +func (f *HistoricalFIFO) Add(v interface{}) error { + obj := checkType(v) + notifications := []Entry(nil) + defer func() { + for _, e := range notifications { + f.carrier(e) + } + }() + + f.lock.Lock() + defer f.lock.Unlock() + + id := obj.GetUID() + if entry, exists := f.items[id]; !exists { + f.queue = append(f.queue, id) + } else { + if entry.Is(DELETE_EVENT | POP_EVENT) { + f.queue = append(f.queue, id) + } + } + notifications = f.merge(id, obj) + f.cond.Broadcast() + return nil +} + +// Update is the same as Add in this implementation. 
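+// A minimal usage sketch (illustrative only; obj stands for any value
+// implementing UniqueCopyable):
+//
+//	events := make(chan Entry, 2)
+//	f := NewHistorical(events)
+//	f.Update(obj)  // identical to f.Add(obj)
+//	e := <-events  // observe the resulting event
+//	_ = e.Is(ADD_EVENT | UPDATE_EVENT)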
+func (f *HistoricalFIFO) Update(obj interface{}) error {
+	return f.Add(obj)
+}
+
+// Delete removes an item. It doesn't add it to the queue, because
+// this implementation assumes the consumer only cares about the objects,
+// not the order in which they were created/added.
+func (f *HistoricalFIFO) Delete(v interface{}) error {
+	obj := checkType(v)
+	deleteEvent := (Entry)(nil)
+	defer func() {
+		f.carrier(deleteEvent)
+	}()
+
+	f.lock.Lock()
+	defer f.lock.Unlock()
+	id := obj.GetUID()
+	item, exists := f.items[id]
+	if exists && !item.Is(DELETE_EVENT) {
+		e := item.(*entry)
+		e.event = DELETE_EVENT
+		deleteEvent = &deletedEntry{e, time.Now().Add(f.lingerTTL)}
+		f.items[id] = deleteEvent
+	}
+	return nil
+}
+
+// List returns a list of all the items.
+func (f *HistoricalFIFO) List() []interface{} {
+	f.lock.RLock()
+	defer f.lock.RUnlock()
+
+	// TODO(jdef): slightly overallocates b/c of deleted items
+	list := make([]interface{}, 0, len(f.queue))
+
+	for _, entry := range f.items {
+		if entry.Is(DELETE_EVENT | POP_EVENT) {
+			continue
+		}
+		list = append(list, entry.Value().Copy())
+	}
+	return list
+}
+
+// ListKeys returns a list of the IDs of all the items.
+func (f *HistoricalFIFO) ListKeys() []string {
+	f.lock.RLock()
+	defer f.lock.RUnlock()
+
+	// TODO(jdef): slightly overallocates b/c of deleted items
+	list := make([]string, 0, len(f.queue))
+
+	for key, entry := range f.items {
+		if entry.Is(DELETE_EVENT | POP_EVENT) {
+			continue
+		}
+		list = append(list, key)
+	}
+	return list
+}
+
+// ContainedIDs returns a util.StringSet containing all IDs of the stored items.
+// This is a snapshot of a moment in time, and one should keep in mind that
+// other go routines can add or remove items after you call this.
+func (c *HistoricalFIFO) ContainedIDs() util.StringSet {
+	c.lock.RLock()
+	defer c.lock.RUnlock()
+	set := util.StringSet{}
+	for id, entry := range c.items {
+		if entry.Is(DELETE_EVENT | POP_EVENT) {
+			continue
+		}
+		set.Insert(id)
+	}
+	return set
+}
+
+// Get returns the requested item, or sets exists=false.
+func (f *HistoricalFIFO) Get(v interface{}) (interface{}, bool, error) {
+	obj := checkType(v)
+	return f.GetByKey(obj.GetUID())
+}
+
+// GetByKey returns the requested item, or sets exists=false.
+func (f *HistoricalFIFO) GetByKey(id string) (interface{}, bool, error) {
+	f.lock.RLock()
+	defer f.lock.RUnlock()
+	entry, exists := f.items[id]
+	if exists && !entry.Is(DELETE_EVENT|POP_EVENT) {
+		return entry.Value().Copy(), true, nil
+	}
+	return nil, false, nil
+}
+
+// Poll returns true if an entry matching both id and the given event mask
+// currently exists.
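+//
+// For example (illustrative only):
+//
+//	if f.Poll("some-uid", DELETE_EVENT|POP_EVENT) {
+//		// the entry has already been popped or deleted
+//	}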
+func (f *HistoricalFIFO) Poll(id string, t EventType) bool {
+	f.lock.RLock()
+	defer f.lock.RUnlock()
+	entry, exists := f.items[id]
+	return exists && entry.Is(t)
+}
+
+// Await is a variant of Pop() with a timeout: it returns the next item, or
+// else nil if the timeout expires first.
+func (q *HistoricalFIFO) Await(timeout time.Duration) interface{} {
+	cancel := make(chan struct{})
+	ch := make(chan interface{}, 1)
+	go func() { ch <- q.pop(cancel) }()
+	select {
+	case <-time.After(timeout):
+		close(cancel)
+		return <-ch
+	case x := <-ch:
+		return x
+	}
+}
+
+func (f *HistoricalFIFO) Pop() interface{} {
+	return f.pop(nil)
+}
+
+func (f *HistoricalFIFO) pop(cancel chan struct{}) interface{} {
+	popEvent := (Entry)(nil)
+	defer func() {
+		f.carrier(popEvent)
+	}()
+
+	f.lock.Lock()
+	defer f.lock.Unlock()
+	for {
+		for len(f.queue) == 0 {
+			signal := make(chan struct{})
+			go func() {
+				defer close(signal)
+				f.cond.Wait()
+			}()
+			select {
+			case <-cancel:
+				// we may not have the lock yet, so
+				// broadcast to abort Wait, then
+				// return after lock re-acquisition
+				f.cond.Broadcast()
+				<-signal
+				return nil
+			case <-signal:
+				// we have the lock, re-check
+				// the queue for data...
+			}
+		}
+		id := f.queue[0]
+		f.queue = f.queue[1:]
+		item, ok := f.items[id]
+		if !ok || item.Is(DELETE_EVENT|POP_EVENT) {
+			// Item may have been deleted subsequently.
+			continue
+		}
+		value := item.Value()
+		popEvent = &entry{value, POP_EVENT}
+		f.items[id] = popEvent
+		return value.Copy()
+	}
+}
+
+func (f *HistoricalFIFO) Replace(objs []interface{}) error {
+	notifications := make([]Entry, 0, len(objs))
+	defer func() {
+		for _, e := range notifications {
+			f.carrier(e)
+		}
+	}()
+
+	idToObj := make(map[string]interface{})
+	for _, v := range objs {
+		obj := checkType(v)
+		idToObj[obj.GetUID()] = v
+	}
+
+	f.lock.Lock()
+	defer f.lock.Unlock()
+
+	f.queue = f.queue[:0]
+	now := time.Now()
+	for id, v := range f.items {
+		if _, exists := idToObj[id]; !exists && !v.Is(DELETE_EVENT) {
+			// a non-deleted entry in the items list that doesn't show up in the
+			// new list: mark it as deleted
+			ent := v.(*entry)
+			ent.event = DELETE_EVENT
+			e := &deletedEntry{ent, now.Add(f.lingerTTL)}
+			f.items[id] = e
+			notifications = append(notifications, e)
+		}
+	}
+	for id, v := range idToObj {
+		obj := checkType(v)
+		f.queue = append(f.queue, id)
+		n := f.merge(id, obj)
+		notifications = append(notifications, n...)
+	}
+	if len(f.queue) > 0 {
+		f.cond.Broadcast()
+	}
+	return nil
+}
+
+// garbage collect DELETEd items whose TTL has expired; the IDs of such items are removed
+// from the queue. This impl assumes that the caller has acquired the state lock.
+func (f *HistoricalFIFO) gc() {
+	now := time.Now()
+	deleted := make(map[string]struct{})
+	for id, v := range f.items {
+		if v.Is(DELETE_EVENT) {
+			ent := v.(*deletedEntry)
+			if ent.expiration.Before(now) {
+				delete(f.items, id)
+				deleted[id] = struct{}{}
+			}
+		}
+	}
+	// remove deleted items from the queue, will likely (slightly) overallocate here
+	queue := make([]string, 0, len(f.queue))
+	for _, id := range f.queue {
+		if _, exists := deleted[id]; !exists {
+			queue = append(queue, id)
+		}
+	}
+	f.queue = queue
+}
+
+// Assumes that the caller has acquired the state lock.
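+// merge stores obj under id and collects the notifications to deliver: an
+// ADD_EVENT for a previously unseen id; a DELETE_EVENT followed by an ADD_EVENT
+// when a live entry's stored UID differs from obj's (a "hidden" delete); or an
+// UPDATE_EVENT when only the value changed. Every 256th call also triggers
+// garbage collection of expired deleted entries.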
+func (f *HistoricalFIFO) merge(id string, obj UniqueCopyable) (notifications []Entry) {
+	item, exists := f.items[id]
+	now := time.Now()
+	if !exists {
+		e := &entry{obj.Copy().(UniqueCopyable), ADD_EVENT}
+		f.items[id] = e
+		notifications = append(notifications, e)
+	} else {
+		if !item.Is(DELETE_EVENT) && item.Value().GetUID() != obj.GetUID() {
+			// hidden DELETE!
+			// (1) append a DELETE
+			// (2) append an ADD
+			// .. and notify listeners in that order
+			ent := item.(*entry)
+			ent.event = DELETE_EVENT
+			e1 := &deletedEntry{ent, now.Add(f.lingerTTL)}
+			e2 := &entry{obj.Copy().(UniqueCopyable), ADD_EVENT}
+			f.items[id] = e2
+			notifications = append(notifications, e1, e2)
+		} else if !reflect.DeepEqual(obj, item.Value()) {
+			//TODO(jdef): it would be nice if we could rely on resource versions
+			//instead of doing a DeepEqual. Maybe someday we'll be able to.
+			e := &entry{obj.Copy().(UniqueCopyable), UPDATE_EVENT}
+			f.items[id] = e
+			notifications = append(notifications, e)
+		}
+	}
+	// check for garbage collection
+	f.gcc++
+	if f.gcc%256 == 0 { //TODO(jdef): extract constant
+		f.gcc = 0
+		f.gc()
+	}
+	return
+}
+
+// NewHistorical returns a Store which can be used to queue up items to
+// process. If a non-nil channel is provided, then modifications to the
+// FIFO are delivered on it as Entry events specific to this fifo.
+func NewHistorical(ch chan<- Entry) FIFO {
+	carrier := dead
+	if ch != nil {
+		carrier = func(msg Entry) {
+			if msg != nil {
+				ch <- msg.Copy().(Entry)
+			}
+		}
+	}
+	f := &HistoricalFIFO{
+		items:     map[string]Entry{},
+		queue:     []string{},
+		carrier:   carrier,
+		lingerTTL: 5 * time.Minute, // TODO(jdef): extract constant
+	}
+	f.cond.L = &f.lock
+	return f
+}
diff --git a/contrib/mesos/pkg/queue/historical_test.go b/contrib/mesos/pkg/queue/historical_test.go
new file mode 100644
index 00000000000..4477601beda
--- /dev/null
+++ b/contrib/mesos/pkg/queue/historical_test.go
@@ -0,0 +1,191 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package queue + +import ( + "fmt" + "testing" + "time" +) + +type _int int +type _uint uint + +func (i _int) Copy() Copyable { + return i +} + +func (i _int) GetUID() string { + return fmt.Sprintf("INT%d", int(i)) +} + +func (i _uint) Copy() Copyable { + return i +} + +func (i _uint) GetUID() string { + return fmt.Sprintf("UINT%d", uint64(i)) +} + +type testObj struct { + id string + value int +} + +func (i *testObj) Copy() Copyable { + if i == nil { + return nil + } else { + return &testObj{i.id, i.value} + } +} + +func (i *testObj) GetUID() string { + return i.id +} + +func TestFIFO_basic(t *testing.T) { + f := NewHistorical(nil) + const amount = 500 + go func() { + for i := 0; i < amount; i++ { + f.Add(_int(i + 1)) + } + }() + go func() { + for u := uint(0); u < amount; u++ { + f.Add(_uint(u + 1)) + } + }() + + lastInt := _int(0) + lastUint := _uint(0) + for i := 0; i < amount*2; i++ { + switch obj := f.Pop().(type) { + case _int: + if obj <= lastInt { + t.Errorf("got %v (int) out of order, last was %v", obj, lastInt) + } + lastInt = obj + case _uint: + if obj <= lastUint { + t.Errorf("got %v (uint) out of order, last was %v", obj, lastUint) + } else { + lastUint = obj + } + default: + t.Fatalf("unexpected type %#v", obj) + } + } +} + +func TestFIFO_addUpdate(t *testing.T) { + f := NewHistorical(nil) + f.Add(&testObj{"foo", 10}) + f.Update(&testObj{"foo", 15}) + got := make(chan *testObj, 2) + go func() { + for { + got <- f.Pop().(*testObj) + } + }() + + first := <-got + if e, a := 15, first.value; e != a { + t.Errorf("Didn't get updated value (%v), got %v", e, a) + } + select { + case unexpected := <-got: + t.Errorf("Got second value %v", unexpected) + case <-time.After(50 * time.Millisecond): + } + _, exists, _ := f.GetByKey("foo") + if exists { + t.Errorf("item did not get removed") + } +} + +func TestFIFO_addReplace(t *testing.T) { + f := NewHistorical(nil) + f.Add(&testObj{"foo", 10}) + f.Replace([]interface{}{&testObj{"foo", 15}}) + got := make(chan *testObj, 2) + go func() { + for { + got <- f.Pop().(*testObj) + } + }() + + first := <-got + if e, a := 15, first.value; e != a { + t.Errorf("Didn't get updated value (%v), got %v", e, a) + } + select { + case unexpected := <-got: + t.Errorf("Got second value %v", unexpected) + case <-time.After(50 * time.Millisecond): + } + _, exists, _ := f.GetByKey("foo") + if exists { + t.Errorf("item did not get removed") + } +} + +func TestFIFO_detectLineJumpers(t *testing.T) { + f := NewHistorical(nil) + + f.Add(&testObj{"foo", 10}) + f.Add(&testObj{"bar", 1}) + f.Add(&testObj{"foo", 11}) + f.Add(&testObj{"foo", 13}) + f.Add(&testObj{"zab", 30}) + + err := error(nil) + done := make(chan struct{}) + go func() { + defer close(done) + if e, a := 13, f.Pop().(*testObj).value; a != e { + err = fmt.Errorf("expected %d, got %d", e, a) + return + } + + f.Add(&testObj{"foo", 14}) // ensure foo doesn't jump back in line + + if e, a := 1, f.Pop().(*testObj).value; a != e { + err = fmt.Errorf("expected %d, got %d", e, a) + return + } + + if e, a := 30, f.Pop().(*testObj).value; a != e { + err = fmt.Errorf("expected %d, got %d", e, a) + return + } + + if e, a := 14, f.Pop().(*testObj).value; a != e { + err = fmt.Errorf("expected %d, got %d", e, a) + return + } + }() + select { + case <-done: + if err != nil { + t.Fatal(err) + } + case <-time.After(1 * time.Second): + t.Fatal("Deadlocked unit test") + } +} diff --git a/contrib/mesos/pkg/queue/interface.go b/contrib/mesos/pkg/queue/interface.go new file mode 100644 index 00000000000..7191552bfbd --- 
/dev/null
+++ b/contrib/mesos/pkg/queue/interface.go
@@ -0,0 +1,103 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package queue
+
+import (
+	"time"
+
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache"
+)
+
+type EventType int
+
+const (
+	ADD_EVENT EventType = 1 << iota
+	UPDATE_EVENT
+	DELETE_EVENT
+	POP_EVENT
+)
+
+type Entry interface {
+	Copyable
+	Value() UniqueCopyable
+	// types is a logically OR'd combination of EventType, e.g. ADD_EVENT|UPDATE_EVENT
+	Is(types EventType) bool
+}
+
+type Copyable interface {
+	// return an independent copy (deep clone) of the current object
+	Copy() Copyable
+}
+
+type UniqueID interface {
+	GetUID() string
+}
+
+type UniqueCopyable interface {
+	Copyable
+	UniqueID
+}
+
+type FIFO interface {
+	cache.Store
+
+	// Pop waits until an item is ready and returns it. If multiple items are
+	// ready, they are returned in the order in which they were added/updated.
+	// The item is removed from the queue (and the store) before it is returned,
+	// so if you don't successfully process it, you need to add it back with Add().
+	Pop() interface{}
+
+	// Await attempts to Pop within the given interval; upon success the non-nil
+	// item is returned, otherwise nil
+	Await(timeout time.Duration) interface{}
+
+	// Is there an entry for the id that matches the event mask?
+	Poll(id string, types EventType) bool
+}
+
+type Delayed interface {
+	// return the remaining delay; a non-positive value indicates no delay
+	GetDelay() time.Duration
+}
+
+type Deadlined interface {
+	// when ok, returns the time when this object should be activated/executed/evaluated
+	Deadline() (deadline time.Time, ok bool)
+}
+
+// No objects are ever expected to be sent over this channel. References to BreakChan
+// instances may be nil (always blocking). Signalling over this channel is performed by
+// closing the channel. As such there can only ever be a single signal sent over the
+// lifetime of the channel.
+type BreakChan <-chan struct{}
+
+// an optional interface to be implemented by Delayed objects; returning a nil
+// channel from Breaker() results in waiting the full delay duration
+type Breakout interface {
+	// return a channel that signals early departure from a blocking delay
+	Breaker() BreakChan
+}
+
+type UniqueDelayed interface {
+	UniqueID
+	Delayed
+}
+
+type UniqueDeadlined interface {
+	UniqueID
+	Deadlined
+}
diff --git a/contrib/mesos/pkg/queue/policy.go b/contrib/mesos/pkg/queue/policy.go
new file mode 100644
index 00000000000..5798aec927d
--- /dev/null
+++ b/contrib/mesos/pkg/queue/policy.go
@@ -0,0 +1,70 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package queue
+
+// Decide whether a pre-existing deadline for an item in a delay-queue should be
+// updated if an attempt is made to offer/add a new deadline for said item. Whether
+// the deadline changes or not has zero impact on the data blob associated with the
+// entry in the queue.
+type DeadlinePolicy int
+
+const (
+	PreferLatest DeadlinePolicy = iota
+	PreferEarliest
+)
+
+// Decide whether a pre-existing data blob in a delay-queue should be replaced if
+// an attempt is made to add/offer a new data blob in its place. Whether the data is
+// replaced has no bearing on the deadline (priority) of the item in the queue.
+type ReplacementPolicy int
+
+const (
+	KeepExisting ReplacementPolicy = iota
+	ReplaceExisting
+)
+
+func (rp ReplacementPolicy) replacementValue(original, replacement interface{}) (result interface{}) {
+	switch rp {
+	case KeepExisting:
+		result = original
+	case ReplaceExisting:
+		fallthrough
+	default:
+		result = replacement
+	}
+	return
+}
+
+func (dp DeadlinePolicy) nextDeadline(a, b Priority) (result Priority) {
+	switch dp {
+	case PreferEarliest:
+		if a.ts.Before(b.ts) {
+			result = a
+		} else {
+			result = b
+		}
+	case PreferLatest:
+		fallthrough
+	default:
+		if a.ts.After(b.ts) {
+			result = a
+		} else {
+			result = b
+		}
+	}
+	return
+}
diff --git a/contrib/mesos/pkg/queue/priority.go b/contrib/mesos/pkg/queue/priority.go
new file mode 100644
index 00000000000..f2ccb8b735e
--- /dev/null
+++ b/contrib/mesos/pkg/queue/priority.go
@@ -0,0 +1,56 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package queue + +import ( + "time" +) + +type Priority struct { + ts time.Time // timestamp + notify BreakChan // notification channel +} + +func (p Priority) Equal(other Priority) bool { + return p.ts.Equal(other.ts) && p.notify == other.notify +} + +func extractFromDelayed(d Delayed) Priority { + deadline := time.Now().Add(d.GetDelay()) + breaker := BreakChan(nil) + if breakout, good := d.(Breakout); good { + breaker = breakout.Breaker() + } + return Priority{ + ts: deadline, + notify: breaker, + } +} + +func extractFromDeadlined(d Deadlined) (Priority, bool) { + if ts, ok := d.Deadline(); ok { + breaker := BreakChan(nil) + if breakout, good := d.(Breakout); good { + breaker = breakout.Breaker() + } + return Priority{ + ts: ts, + notify: breaker, + }, true + } + return Priority{}, false +} diff --git a/contrib/mesos/pkg/redirfd/doc.go b/contrib/mesos/pkg/redirfd/doc.go new file mode 100644 index 00000000000..1092ad941d4 --- /dev/null +++ b/contrib/mesos/pkg/redirfd/doc.go @@ -0,0 +1,19 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Some file descriptor manipulation funcs (Unix-Only), inspired by +// https://github.com/skarnet/execline/blob/master/src/execline/redirfd.c +package redirfd diff --git a/contrib/mesos/pkg/redirfd/file_descriptor.go b/contrib/mesos/pkg/redirfd/file_descriptor.go new file mode 100644 index 00000000000..2c717e15c9f --- /dev/null +++ b/contrib/mesos/pkg/redirfd/file_descriptor.go @@ -0,0 +1,41 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package redirfd + +import ( + "fmt" + "strconv" +) + +// FileDescriptor mirrors unix-specific indexes for cross-platform use +type FileDescriptor int + +const ( + InvalidFD FileDescriptor = -1 + Stdin FileDescriptor = 0 + Stdout FileDescriptor = 1 + Stderr FileDescriptor = 2 +) + +// ParseFileDescriptor parses a string formatted file descriptor +func ParseFileDescriptor(fdstr string) (FileDescriptor, error) { + fdint, err := strconv.Atoi(fdstr) + if err != nil { + return InvalidFD, fmt.Errorf("file descriptor must be an integer: %q", fdstr) + } + return FileDescriptor(fdint), nil +} diff --git a/contrib/mesos/pkg/redirfd/file_descriptor_test.go b/contrib/mesos/pkg/redirfd/file_descriptor_test.go new file mode 100644 index 00000000000..787f2294455 --- /dev/null +++ b/contrib/mesos/pkg/redirfd/file_descriptor_test.go @@ -0,0 +1,54 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package redirfd + +import ( + "testing" + + . "github.com/onsi/gomega" +) + +func TestParseFileDescriptor(t *testing.T) { + RegisterTestingT(t) + + valid := map[string]FileDescriptor{ + "-1": InvalidFD, + "0": Stdin, + "1": Stdout, + "2": Stderr, + "3": FileDescriptor(3), + } + + for input, expected := range valid { + fd, err := ParseFileDescriptor(input) + Expect(err).ToNot(HaveOccurred(), "Input: '%s'", input) + Expect(fd).To(Equal(expected), "Input: '%s'", input) + } + + invalid := []string{ + "a", + " 1", + "blue", + "stderr", + "STDERR", + } + + for _, input := range invalid { + _, err := ParseFileDescriptor(input) + Expect(err).To(HaveOccurred(), "Input: '%s'", input) + } +} diff --git a/contrib/mesos/pkg/redirfd/redirfd_unix.go b/contrib/mesos/pkg/redirfd/redirfd_unix.go new file mode 100644 index 00000000000..a2159e1c98e --- /dev/null +++ b/contrib/mesos/pkg/redirfd/redirfd_unix.go @@ -0,0 +1,208 @@ +// +build !windows + +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package redirfd
+
+import (
+	"fmt"
+	"os"
+	"syscall"
+)
+
+type RedirectMode int
+
+const (
+	Read           RedirectMode = iota // open file for reading
+	Write                              // open file for writing, truncating if it exists
+	Update                             // open file for read & write
+	Append                             // open file for append, create if it does not exist
+	AppendExisting                     // open file for append, do not create if it does not already exist
+	WriteNew                           // open file for writing, creating it, failing if it already exists
+)
+
+// see https://github.com/skarnet/execline/blob/master/src/execline/redirfd.c
+func (mode RedirectMode) Redirect(nonblock, changemode bool, fd FileDescriptor, name string) (*os.File, error) {
+	flags := 0
+	what := -1
+
+	switch mode {
+	case Read:
+		what = syscall.O_RDONLY
+		flags &= ^(syscall.O_APPEND | syscall.O_CREAT | syscall.O_TRUNC | syscall.O_EXCL)
+	case Write:
+		what = syscall.O_WRONLY
+		flags |= syscall.O_CREAT | syscall.O_TRUNC
+		flags &= ^(syscall.O_APPEND | syscall.O_EXCL)
+	case Update:
+		what = syscall.O_RDWR
+		flags &= ^(syscall.O_APPEND | syscall.O_CREAT | syscall.O_TRUNC | syscall.O_EXCL)
+	case Append:
+		what = syscall.O_WRONLY
+		flags |= syscall.O_CREAT | syscall.O_APPEND
+		flags &= ^(syscall.O_TRUNC | syscall.O_EXCL)
+	case AppendExisting:
+		what = syscall.O_WRONLY
+		flags |= syscall.O_APPEND
+		flags &= ^(syscall.O_CREAT | syscall.O_TRUNC | syscall.O_EXCL)
+	case WriteNew:
+		what = syscall.O_WRONLY
+		flags |= syscall.O_CREAT | syscall.O_EXCL
+		flags &= ^(syscall.O_APPEND | syscall.O_TRUNC)
+	default:
+		return nil, fmt.Errorf("unexpected mode %d", mode)
+	}
+	if nonblock {
+		flags |= syscall.O_NONBLOCK
+	}
+	flags |= what
+
+	fd2, e := open(name, flags, 0666)
+	if (what == syscall.O_WRONLY) && (e == syscall.ENXIO) {
+		// open the file read-only and non-blocking first, then retry the
+		// originally requested mode (works around ENXIO when opening a
+		// fifo for writing before any reader exists)
+		fdr, e2 := open(name, syscall.O_RDONLY|syscall.O_NONBLOCK, 0)
+		if e2 != nil {
+			return nil, &os.PathError{"open_read", name, e2}
+		}
+		fd2, e = open(name, flags, 0666)
+		fd_close(fdr)
+	}
+	if e != nil {
+		return nil, &os.PathError{"open", name, e}
+	}
+	if e = fd_move(fd, fd2); e != nil {
+		return nil, &os.PathError{"fd_move", name, e}
+	}
+	if changemode {
+		if nonblock {
+			e = ndelay_off(fd)
+		} else {
+			e = ndelay_on(fd)
+		}
+		if e != nil {
+			return nil, &os.PathError{"ndelay", name, e}
+		}
+	}
+	// after fd_move the open file lives at fd (fd2 has been closed)
+	return os.NewFile(uintptr(fd), name), nil
+}
+
+// proxy to return a FileDescriptor
+func open(path string, openmode int, perm uint32) (FileDescriptor, error) {
+	fdint, err := syscall.Open(path, openmode, perm)
+	return FileDescriptor(fdint), err
+}
+
+// see https://github.com/skarnet/skalibs/blob/master/src/libstddjb/fd_move.c
+func fd_move(to, from FileDescriptor) (err error) {
+	if to == from {
+		return
+	}
+	for {
+		_, _, e1 := syscall.RawSyscall(syscall.SYS_DUP2, uintptr(from), uintptr(to), 0)
+		if e1 != syscall.EINTR {
+			if e1 != 0 {
+				err = e1
+			}
+			break
+		}
+	}
+	if err == nil {
+		// dup2 succeeded: release the source descriptor, mirroring the C below
+		err = fd_close(from)
+	}
+	return
+	/*
+		do
+			r = dup2(from, to) ;
+		while ((r == -1) && (errno == EINTR)) ;
+		return (r == -1) ? -1 : fd_close(from) ;
+
+// see https://github.com/skarnet/skalibs/blob/master/src/libstddjb/fd_close.c
+func fd_close(fd FileDescriptor) (err error) {
+	i := 0
+	var e error
+	for {
+		if e = syscall.Close(int(fd)); e == nil {
+			// success, per `if (!close(fd)) return 0 ;` in the C reference below
+			return nil
+		}
+		i++
+		if e != syscall.EINTR {
+			break
+		}
+	}
+	if e == syscall.EBADF && i > 1 {
+		return nil
+	}
+	return e
+}
+
+/*
+int fd_close (int fd)
+{
+  register unsigned int i = 0 ;
+doit:
+  if (!close(fd)) return 0 ;
+  i++ ;
+  if (errno == EINTR) goto doit ;
+  return ((errno == EBADF) && (i > 1)) ? 0 : -1 ;
+}
+*/
+
+// see https://github.com/skarnet/skalibs/blob/master/src/libstddjb/ndelay_on.c
+func ndelay_on(fd FileDescriptor) error {
+	// 32-bit will likely break because it needs SYS_FCNTL64
+	got, _, e := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), uintptr(syscall.F_GETFL), 0)
+	if e != 0 {
+		return e
+	}
+	_, _, e = syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), uintptr(syscall.F_SETFL), uintptr(got|syscall.O_NONBLOCK))
+	if e != 0 {
+		return e
+	}
+	return nil
+}
+
+/*
+int ndelay_on (int fd)
+{
+  register int got = fcntl(fd, F_GETFL) ;
+  return (got == -1) ? -1 : fcntl(fd, F_SETFL, got | O_NONBLOCK) ;
+}
+*/
+
+// see https://github.com/skarnet/skalibs/blob/master/src/libstddjb/ndelay_off.c
+func ndelay_off(fd FileDescriptor) error {
+	// 32-bit will likely break because it needs SYS_FCNTL64
+	got, _, e := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), uintptr(syscall.F_GETFL), 0)
+	if e != 0 {
+		return e
+	}
+	_, _, e = syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), uintptr(syscall.F_SETFL), uintptr(int(got) & ^syscall.O_NONBLOCK))
+	if e != 0 {
+		return e
+	}
+	return nil
+}
+
+/*
+int ndelay_off (int fd)
+{
+  register int got = fcntl(fd, F_GETFL) ;
+  return (got == -1) ? -1 : fcntl(fd, F_SETFL, got & ^O_NONBLOCK) ;
+}
+*/
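+
+// An illustrative sketch (hypothetical caller, not part of this package) of
+// restoring blocking mode on a descriptor that was opened with O_NONBLOCK:
+//
+//	if err := ndelay_off(fd); err != nil {
+//		// fcntl failed; fd is left in non-blocking mode
+//	}
diff --git a/contrib/mesos/pkg/redirfd/redirfd_windows.go b/contrib/mesos/pkg/redirfd/redirfd_windows.go
new file mode 100644
index 00000000000..609d158d2d4
--- /dev/null
+++ b/contrib/mesos/pkg/redirfd/redirfd_windows.go
@@ -0,0 +1,39 @@
+// +build windows
+
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.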
+*/
+
+package redirfd
+
+import (
+	"fmt"
+	"os"
+)
+
+type RedirectMode int
+
+const (
+	Read           RedirectMode = iota // open file for reading
+	Write                              // open file for writing, truncating if it exists
+	Update                             // open file for read & write
+	Append                             // open file for append, create if it does not exist
+	AppendExisting                     // open file for append, do not create if it does not already exist
+	WriteNew                           // open file for writing, creating it, failing if it already exists
+)
+
+func (mode RedirectMode) Redirect(nonblock, changemode bool, fd FileDescriptor, name string) (*os.File, error) {
+	return nil, fmt.Errorf("Redirect(%t, %t, %d, %q) not supported on windows", nonblock, changemode, fd, name)
+}
diff --git a/contrib/mesos/pkg/runtime/doc.go b/contrib/mesos/pkg/runtime/doc.go
new file mode 100644
index 00000000000..7acc851bb99
--- /dev/null
+++ b/contrib/mesos/pkg/runtime/doc.go
@@ -0,0 +1,19 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package runtime provides utilities for semaphores (chan struct{}),
+// a simple Latch implementation, and metrics for reporting handled panics.
+package runtime
diff --git a/contrib/mesos/pkg/runtime/latch.go b/contrib/mesos/pkg/runtime/latch.go
new file mode 100644
index 00000000000..93514ae46c7
--- /dev/null
+++ b/contrib/mesos/pkg/runtime/latch.go
@@ -0,0 +1,35 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package runtime
+
+import (
+	"sync/atomic"
+)
+
+type Latch struct {
+	int32
+}
+
+// Acquire returns true if this latch was successfully acquired; it is safe
+// for concurrent use. Only the first invocation returns true, all subsequent
+// invocations return false. Always returns false when self is nil.
+func (self *Latch) Acquire() bool {
+	if self == nil {
+		return false
+	}
+	return atomic.CompareAndSwapInt32(&self.int32, 0, 1)
+}
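+
+// A minimal usage sketch (illustrative only; stopCh and stop are hypothetical
+// names, not defined in this package): guarding a one-shot shutdown path that
+// may be raced by several goroutines.
+//
+//	var stopped Latch
+//	stop := func() {
+//		if stopped.Acquire() {
+//			// first caller wins; this branch runs exactly once
+//			close(stopCh)
+//		}
+//	}
diff --git a/contrib/mesos/pkg/runtime/latch_test.go b/contrib/mesos/pkg/runtime/latch_test.go
new file mode 100644
index 00000000000..5bb4600f02d
--- /dev/null
+++ b/contrib/mesos/pkg/runtime/latch_test.go
@@ -0,0 +1,61 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.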
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package runtime + +import ( + "sync" + "sync/atomic" + "testing" + "time" +) + +func Test_LatchAcquireBasic(t *testing.T) { + var x Latch + if !x.Acquire() { + t.Fatalf("expected first acquire to succeed") + } + if x.Acquire() { + t.Fatalf("expected second acquire to fail") + } + if x.Acquire() { + t.Fatalf("expected third acquire to fail") + } +} + +func Test_LatchAcquireConcurrent(t *testing.T) { + var x Latch + const NUM = 10 + ch := make(chan struct{}) + var success int32 + var wg sync.WaitGroup + wg.Add(NUM) + for i := 0; i < NUM; i++ { + go func() { + defer wg.Done() + <-ch + if x.Acquire() { + atomic.AddInt32(&success, 1) + } + }() + } + time.Sleep(200 * time.Millisecond) + close(ch) + wg.Wait() + if success != 1 { + t.Fatalf("expected single acquire to succeed instead of %d", success) + } +} diff --git a/contrib/mesos/pkg/runtime/metrics.go b/contrib/mesos/pkg/runtime/metrics.go new file mode 100644 index 00000000000..ba25f92a23c --- /dev/null +++ b/contrib/mesos/pkg/runtime/metrics.go @@ -0,0 +1,47 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package runtime + +import ( + "sync" + + "github.com/GoogleCloudPlatform/kubernetes/pkg/util" + "github.com/prometheus/client_golang/prometheus" +) + +const ( + runtimeSubsystem = "runtime" +) + +var ( + panicCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Subsystem: runtimeSubsystem, + Name: "panics", + Help: "Counter of panics handled by the internal crash handler.", + }, + ) +) + +var registerMetrics sync.Once + +func Register() { + registerMetrics.Do(func() { + prometheus.MustRegister(panicCounter) + util.PanicHandlers = append(util.PanicHandlers, func(interface{}) { panicCounter.Inc() }) + }) +} diff --git a/contrib/mesos/pkg/runtime/util.go b/contrib/mesos/pkg/runtime/util.go new file mode 100644 index 00000000000..ed7974245eb --- /dev/null +++ b/contrib/mesos/pkg/runtime/util.go @@ -0,0 +1,122 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package runtime
+
+import (
+	"os"
+	"sync"
+	"time"
+
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
+)
+
+type Signal <-chan struct{}
+
+// Closer returns a func that will close the signal chan.
+// Multiple invocations of the returned func will not generate a panic. However,
+// two funcs obtained from separate invocations of Closer (on the same sig chan)
+// will cause a panic if both are invoked. For example:
+//	// good
+//	sig := make(chan struct{})
+//	f := runtime.Closer(sig)
+//	f()
+//	f()
+//
+//	// bad
+//	sig := make(chan struct{})
+//	f := runtime.Closer(sig)
+//	g := runtime.Closer(sig)
+//	f()
+//	g() // this will panic
+func Closer(sig chan<- struct{}) func() {
+	var once sync.Once
+	return func() {
+		once.Do(func() { close(sig) })
+	}
+}
+
+// Then invokes function f once sig fires and immediately returns a signal
+// that indicates f's completion. Used to chain handler funcs, for example:
+//	On(job.Done(), response.Send).Then(wg.Done)
+func (sig Signal) Then(f func()) Signal {
+	if sig == nil {
+		return nil
+	}
+	return On(sig, f)
+}
+
+// On executes callback f after the given signal chan closes and immediately
+// returns a signal that indicates f's completion.
+func On(sig <-chan struct{}, f func()) Signal {
+	if sig == nil {
+		return nil
+	}
+	return After(func() {
+		<-sig
+		if f != nil {
+			f()
+		}
+	})
+}
+
+// OnOSSignal executes callback f once an OS signal arrives on sig and
+// immediately returns a signal that indicates f's completion.
+func OnOSSignal(sig <-chan os.Signal, f func(os.Signal)) Signal {
+	if sig == nil {
+		return nil
+	}
+	return After(func() {
+		if s, ok := <-sig; ok && f != nil {
+			f(s)
+		}
+	})
+}
+
+// After spawns a goroutine to execute f and immediately returns a chan that
+// closes upon completion of f. A nil f is tolerated: the returned chan still
+// closes once the spawned goroutine runs.
+func After(f func()) Signal {
+	ch := make(chan struct{})
+	go func() {
+		defer close(ch)
+		defer util.HandleCrash()
+		if f != nil {
+			f()
+		}
+	}()
+	return Signal(ch)
+}
+
+// Until periodically executes the given function until stopCh is closed.
+// This func blocks until stopCh is closed; it's intended to be run as a goroutine.
+func Until(f func(), period time.Duration, stopCh <-chan struct{}) {
+	if f == nil {
+		return
+	}
+	for {
+		select {
+		case <-stopCh:
+			return
+		default:
+		}
+		func() {
+			defer util.HandleCrash()
+			f()
+		}()
+		select {
+		case <-stopCh:
+		case <-time.After(period):
+		}
+	}
+}
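+
+// A short, illustrative sketch of how these helpers compose (serve, cleanup
+// and pollHealth are hypothetical funcs, not defined in this package):
+//
+//	stopCh := make(chan struct{})
+//	done := After(serve)                     // closes once serve() returns
+//	done.Then(cleanup)                       // runs cleanup after serve completes
+//	Until(pollHealth, 5*time.Second, stopCh) // blocks, polling until stopCh closes
diff --git a/contrib/mesos/pkg/runtime/util_test.go b/contrib/mesos/pkg/runtime/util_test.go
new file mode 100644
index 00000000000..7c9cc1fb22c
--- /dev/null
+++ b/contrib/mesos/pkg/runtime/util_test.go
@@ -0,0 +1,64 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.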
+*/ + +package runtime + +import ( + "testing" + "time" +) + +func TestUntil(t *testing.T) { + ch := make(chan struct{}) + close(ch) + Until(func() { + t.Fatal("should not have been invoked") + }, 0, ch) + + //-- + ch = make(chan struct{}) + called := make(chan struct{}) + After(func() { + Until(func() { + called <- struct{}{} + }, 0, ch) + }).Then(func() { close(called) }) + + <-called + close(ch) + <-called + + //-- + ch = make(chan struct{}) + called = make(chan struct{}) + running := make(chan struct{}) + After(func() { + Until(func() { + close(running) + called <- struct{}{} + }, 2*time.Second, ch) + }).Then(func() { close(called) }) + + <-running + close(ch) + <-called // unblock the goroutine + now := time.Now() + + <-called + if time.Since(now) > 1800*time.Millisecond { + t.Fatalf("Until should not have waited the full timeout period since we closed the stop chan") + } +} diff --git a/contrib/mesos/pkg/scheduler/config/config.go b/contrib/mesos/pkg/scheduler/config/config.go new file mode 100644 index 00000000000..5290729b482 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/config/config.go @@ -0,0 +1,109 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package config + +import ( + "io" + "time" + + "code.google.com/p/gcfg" +) + +const ( + DefaultOfferTTL = 5 * time.Second // duration an offer is viable, prior to being expired + DefaultOfferLingerTTL = 120 * time.Second // duration an expired offer lingers in history + DefaultListenerDelay = 1 * time.Second // duration between offer listener notifications + DefaultUpdatesBacklog = 2048 // size of the pod updates channel + DefaultFrameworkIdRefreshInterval = 30 * time.Second // interval we update the frameworkId stored in etcd + DefaultInitialImplicitReconciliationDelay = 15 * time.Second // wait this amount of time after initial registration before attempting implicit reconciliation + DefaultExplicitReconciliationMaxBackoff = 2 * time.Minute // interval in between internal task status checks/updates + DefaultExplicitReconciliationAbortTimeout = 30 * time.Second // waiting period after attempting to cancel an ongoing reconciliation + DefaultInitialPodBackoff = 1 * time.Second + DefaultMaxPodBackoff = 60 * time.Second + DefaultHttpHandlerTimeout = 10 * time.Second + DefaultHttpBindInterval = 5 * time.Second +) + +// Example scheduler configuration file: +// +// [scheduler] +// info-name = Kubernetes +// offer-ttl = 5s +// offer-linger-ttl = 2m + +type ConfigWrapper struct { + Scheduler Config +} + +type Config struct { + OfferTTL WrappedDuration `gcfg:"offer-ttl"` + OfferLingerTTL WrappedDuration `gcfg:"offer-linger-ttl"` + ListenerDelay WrappedDuration `gcfg:"listener-delay"` + UpdatesBacklog int `gcfg:"updates-backlog"` + FrameworkIdRefreshInterval WrappedDuration `gcfg:"framework-id-refresh-interval"` + InitialImplicitReconciliationDelay WrappedDuration `gcfg:"initial-implicit-reconciliation-delay"` + ExplicitReconciliationMaxBackoff WrappedDuration 
`gcfg:"explicit-reconciliation-max-backoff"`
+	ExplicitReconciliationAbortTimeout WrappedDuration `gcfg:"explicit-reconciliation-abort-timeout"`
+	InitialPodBackoff                  WrappedDuration `gcfg:"initial-pod-backoff"`
+	MaxPodBackoff                      WrappedDuration `gcfg:"max-pod-backoff"`
+	HttpHandlerTimeout                 WrappedDuration `gcfg:"http-handler-timeout"`
+	HttpBindInterval                   WrappedDuration `gcfg:"http-bind-interval"`
+}
+
+type WrappedDuration struct {
+	time.Duration
+}
+
+func (wd *WrappedDuration) UnmarshalText(data []byte) error {
+	d, err := time.ParseDuration(string(data))
+	if err == nil {
+		wd.Duration = d
+	}
+	return err
+}
+
+func (c *Config) SetDefaults() {
+	c.OfferTTL = WrappedDuration{DefaultOfferTTL}
+	c.OfferLingerTTL = WrappedDuration{DefaultOfferLingerTTL}
+	c.ListenerDelay = WrappedDuration{DefaultListenerDelay}
+	c.UpdatesBacklog = DefaultUpdatesBacklog
+	c.FrameworkIdRefreshInterval = WrappedDuration{DefaultFrameworkIdRefreshInterval}
+	c.InitialImplicitReconciliationDelay = WrappedDuration{DefaultInitialImplicitReconciliationDelay}
+	c.ExplicitReconciliationMaxBackoff = WrappedDuration{DefaultExplicitReconciliationMaxBackoff}
+	c.ExplicitReconciliationAbortTimeout = WrappedDuration{DefaultExplicitReconciliationAbortTimeout}
+	c.InitialPodBackoff = WrappedDuration{DefaultInitialPodBackoff}
+	c.MaxPodBackoff = WrappedDuration{DefaultMaxPodBackoff}
+	c.HttpHandlerTimeout = WrappedDuration{DefaultHttpHandlerTimeout}
+	c.HttpBindInterval = WrappedDuration{DefaultHttpBindInterval}
+}
+
+func CreateDefaultConfig() *Config {
+	c := &Config{}
+	c.SetDefaults()
+	return c
+}
+
+func (c *Config) Read(configReader io.Reader) error {
+	wrapper := &ConfigWrapper{Scheduler: *c}
+	if configReader != nil {
+		if err := gcfg.ReadInto(wrapper, configReader); err != nil {
+			return err
+		}
+		*c = wrapper.Scheduler
+	}
+	return nil
+}
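+
+// Illustrative usage of Read (the path below is hypothetical; any io.Reader
+// yielding gcfg-style content works):
+//
+//	c := CreateDefaultConfig()
+//	f, err := os.Open("/etc/k8sm/scheduler.conf")
+//	if err == nil {
+//		defer f.Close()
+//		err = c.Read(f)
+//	}
diff --git a/contrib/mesos/pkg/scheduler/config/config_test.go b/contrib/mesos/pkg/scheduler/config/config_test.go
new file mode 100644
index 00000000000..c316b3bc557
--- /dev/null
+++ b/contrib/mesos/pkg/scheduler/config/config_test.go
@@ -0,0 +1,112 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.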
+*/
+
+package config
+
+import (
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func isDefault(c *Config, t *testing.T) {
+	assert := assert.New(t)
+
+	assert.Equal(DefaultOfferTTL, c.OfferTTL.Duration)
+	assert.Equal(DefaultOfferLingerTTL, c.OfferLingerTTL.Duration)
+	assert.Equal(DefaultListenerDelay, c.ListenerDelay.Duration)
+	assert.Equal(DefaultUpdatesBacklog, c.UpdatesBacklog)
+	assert.Equal(DefaultFrameworkIdRefreshInterval, c.FrameworkIdRefreshInterval.Duration)
+	assert.Equal(DefaultInitialImplicitReconciliationDelay, c.InitialImplicitReconciliationDelay.Duration)
+	assert.Equal(DefaultExplicitReconciliationMaxBackoff, c.ExplicitReconciliationMaxBackoff.Duration)
+	assert.Equal(DefaultExplicitReconciliationAbortTimeout, c.ExplicitReconciliationAbortTimeout.Duration)
+	assert.Equal(DefaultInitialPodBackoff, c.InitialPodBackoff.Duration)
+	assert.Equal(DefaultMaxPodBackoff, c.MaxPodBackoff.Duration)
+	assert.Equal(DefaultHttpHandlerTimeout, c.HttpHandlerTimeout.Duration)
+	assert.Equal(DefaultHttpBindInterval, c.HttpBindInterval.Duration)
+}
+
+// Check that SetDefaults sets the default values
+func TestConfig_SetDefaults(t *testing.T) {
+	c := &Config{}
+	c.SetDefaults()
+	isDefault(c, t)
+}
+
+// Check that CreateDefaultConfig returns a default config
+func TestConfig_CreateDefaultConfig(t *testing.T) {
+	c := CreateDefaultConfig()
+	isDefault(c, t)
+}
+
+// Check that a config string can be parsed
+func TestConfig_Read(t *testing.T) {
+	assert := assert.New(t)
+
+	c := CreateDefaultConfig()
+	reader := strings.NewReader(`
+	[scheduler]
+	offer-ttl=42s
+	offer-linger-ttl=42s
+	listener-delay=42s
+	updates-backlog=42
+	framework-id-refresh-interval=42s
+	initial-implicit-reconciliation-delay=42s
+	explicit-reconciliation-max-backoff=42s
+	explicit-reconciliation-abort-timeout=42s
+	initial-pod-backoff=42s
+	max-pod-backoff=42s
+	http-handler-timeout=42s
+	http-bind-interval=42s
+	`)
+	err := c.Read(reader)
+	if err != nil {
+		t.Fatal("Cannot parse scheduler config: " + err.Error())
+	}
+
+	assert.Equal(42*time.Second, c.OfferTTL.Duration)
+	assert.Equal(42*time.Second, c.OfferLingerTTL.Duration)
+	assert.Equal(42*time.Second, c.ListenerDelay.Duration)
+	assert.Equal(42, c.UpdatesBacklog)
+	assert.Equal(42*time.Second, c.FrameworkIdRefreshInterval.Duration)
+	assert.Equal(42*time.Second, c.InitialImplicitReconciliationDelay.Duration)
+	assert.Equal(42*time.Second, c.ExplicitReconciliationMaxBackoff.Duration)
+	assert.Equal(42*time.Second, c.ExplicitReconciliationAbortTimeout.Duration)
+	assert.Equal(42*time.Second, c.InitialPodBackoff.Duration)
+	assert.Equal(42*time.Second, c.MaxPodBackoff.Duration)
+	assert.Equal(42*time.Second, c.HttpHandlerTimeout.Duration)
+	assert.Equal(42*time.Second, c.HttpBindInterval.Duration)
+}
+
+// Check that an invalid config is rejected and that none of the values are overwritten
+func TestConfig_ReadError(t *testing.T) {
+	assert := assert.New(t)
+
+	c := CreateDefaultConfig()
+	reader := strings.NewReader(`
+	[scheduler]
+	offer-ttl = 42s
+	invalid-setting = 42s
+	`)
+	err := c.Read(reader)
+	if err == nil {
+		t.Fatal("Invalid scheduler config should lead to an error")
+	}
+
+	assert.NotEqual(42*time.Second, c.OfferTTL.Duration)
+}
diff --git a/contrib/mesos/pkg/scheduler/config/doc.go b/contrib/mesos/pkg/scheduler/config/doc.go
new file mode 100644
index 00000000000..7ce9a982e95
--- /dev/null
+++ b/contrib/mesos/pkg/scheduler/config/doc.go
@@ -0,0 +1,18 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights
reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package config provides mechanisms for low-level scheduler tuning. +package config diff --git a/contrib/mesos/pkg/scheduler/constraint/constraint.go b/contrib/mesos/pkg/scheduler/constraint/constraint.go new file mode 100644 index 00000000000..a2a90b3c377 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/constraint/constraint.go @@ -0,0 +1,106 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package constraint + +import ( + "encoding/json" + "fmt" +) + +type OperatorType int + +const ( + UniqueOperator OperatorType = iota + LikeOperator + ClusterOperator + GroupByOperator + UnlikeOperator +) + +var ( + labels = []string{ + "UNIQUE", + "LIKE", + "CLUSTER", + "GROUP_BY", + "UNLIKE", + } + + labelToType map[string]OperatorType +) + +func init() { + labelToType = make(map[string]OperatorType) + for i, s := range labels { + labelToType[s] = OperatorType(i) + } +} + +func (t OperatorType) String() string { + switch t { + case UniqueOperator, LikeOperator, ClusterOperator, GroupByOperator, UnlikeOperator: + return labels[int(t)] + default: + panic(fmt.Sprintf("unrecognized operator type: %d", int(t))) + } +} + +func parseOperatorType(s string) (OperatorType, error) { + t, found := labelToType[s] + if !found { + return UniqueOperator, fmt.Errorf("unrecognized operator %q", s) + } + return t, nil +} + +type Constraint struct { + Field string // required + Operator OperatorType // required + Value string // optional +} + +func (c *Constraint) MarshalJSON() ([]byte, error) { + var a []string + if c != nil { + if c.Value != "" { + a = append(a, c.Field, c.Operator.String(), c.Value) + } else { + a = append(a, c.Field, c.Operator.String()) + } + } + return json.Marshal(a) +} + +func (c *Constraint) UnmarshalJSON(buf []byte) (err error) { + var a []string + if err = json.Unmarshal(buf, &a); err != nil { + return err + } + switch x := len(a); { + case x < 2: + err = fmt.Errorf("not enough arguments to form constraint") + case x > 3: + err = fmt.Errorf("too many arguments to form constraint") + case x == 3: + c.Value = a[2] + fallthrough + case x == 2: + c.Field = a[0] + c.Operator, err = parseOperatorType(a[1]) + } + return err +} diff --git a/contrib/mesos/pkg/scheduler/constraint/constraint_test.go b/contrib/mesos/pkg/scheduler/constraint/constraint_test.go new file mode 100644 index 00000000000..2869e2d2109 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/constraint/constraint_test.go @@ -0,0 +1,79 @@ +/* 
+Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package constraint + +import ( + "encoding/json" + "testing" +) + +func TestDeserialize(t *testing.T) { + shouldMatch := func(js string, field string, operator OperatorType, value string) (err error) { + constraint := Constraint{} + if err = json.Unmarshal(([]byte)(js), &constraint); err != nil { + return + } + if field != constraint.Field { + t.Fatalf("expected field %q instead of %q", field, constraint.Field) + } + if operator != constraint.Operator { + t.Fatalf("expected operator %v instead of %v", operator, constraint.Operator) + } + if value != constraint.Value { + t.Fatalf("expected value %q instead of %q", value, constraint.Value) + } + return + } + failOnError := func(err error) { + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + } + failOnError(shouldMatch(`["hostname","UNIQUE"]`, "hostname", UniqueOperator, "")) + failOnError(shouldMatch(`["rackid","GROUP_BY","1"]`, "rackid", GroupByOperator, "1")) + failOnError(shouldMatch(`["jdk","LIKE","7"]`, "jdk", LikeOperator, "7")) + failOnError(shouldMatch(`["jdk","UNLIKE","7"]`, "jdk", UnlikeOperator, "7")) + failOnError(shouldMatch(`["bob","CLUSTER","foo"]`, "bob", ClusterOperator, "foo")) + err := shouldMatch(`["bill","NOT_REALLY_AN_OPERATOR","pete"]`, "bill", ClusterOperator, "pete") + if err == nil { + t.Fatalf("expected unmarshalling error for invalid operator") + } +} + +func TestSerialize(t *testing.T) { + shouldMatch := func(expected string, constraint *Constraint) error { + data, err := json.Marshal(constraint) + if err != nil { + return err + } + js := string(data) + if js != expected { + t.Fatalf("expected json %q instead of %q", expected, js) + } + return nil + } + failOnError := func(err error) { + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + } + failOnError(shouldMatch(`["hostname","UNIQUE"]`, &Constraint{"hostname", UniqueOperator, ""})) + failOnError(shouldMatch(`["rackid","GROUP_BY","1"]`, &Constraint{"rackid", GroupByOperator, "1"})) + failOnError(shouldMatch(`["jdk","LIKE","7"]`, &Constraint{"jdk", LikeOperator, "7"})) + failOnError(shouldMatch(`["jdk","UNLIKE","7"]`, &Constraint{"jdk", UnlikeOperator, "7"})) + failOnError(shouldMatch(`["bob","CLUSTER","foo"]`, &Constraint{"bob", ClusterOperator, "foo"})) +} diff --git a/contrib/mesos/pkg/scheduler/constraint/doc.go b/contrib/mesos/pkg/scheduler/constraint/doc.go new file mode 100644 index 00000000000..76f021328b0 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/constraint/doc.go @@ -0,0 +1,19 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package constraint exposes Marathon-like constraints for scheduling pods. +// Incomplete. +package constraint diff --git a/contrib/mesos/pkg/scheduler/doc.go b/contrib/mesos/pkg/scheduler/doc.go new file mode 100644 index 00000000000..40552fc1a7d --- /dev/null +++ b/contrib/mesos/pkg/scheduler/doc.go @@ -0,0 +1,18 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package scheduler implements the Kubernetes Mesos scheduler. +package scheduler diff --git a/contrib/mesos/pkg/scheduler/fcfs.go b/contrib/mesos/pkg/scheduler/fcfs.go new file mode 100644 index 00000000000..761c49c362a --- /dev/null +++ b/contrib/mesos/pkg/scheduler/fcfs.go @@ -0,0 +1,57 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package scheduler + +import ( + "fmt" + log "github.com/golang/glog" + + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask" +) + +// A first-come-first-serve scheduler: acquires the first offer that can support the task +func FCFSScheduleFunc(r offers.Registry, unused SlaveIndex, task *podtask.T) (offers.Perishable, error) { + podName := fmt.Sprintf("%s/%s", task.Pod.Namespace, task.Pod.Name) + var acceptedOffer offers.Perishable + err := r.Walk(func(p offers.Perishable) (bool, error) { + offer := p.Details() + if offer == nil { + return false, fmt.Errorf("nil offer while scheduling task %v", task.ID) + } + if task.AcceptOffer(offer) { + if p.Acquire() { + acceptedOffer = p + log.V(3).Infof("Pod %s accepted offer %v", podName, offer.Id.GetValue()) + return true, nil // stop, we found an offer + } + } + return false, nil // continue + }) + if acceptedOffer != nil { + if err != nil { + log.Warningf("problems walking the offer registry: %v, attempting to continue", err) + } + return acceptedOffer, nil + } + if err != nil { + log.V(2).Infof("failed to find a fit for pod: %s, err = %v", podName, err) + return nil, err + } + log.V(2).Infof("failed to find a fit for pod: %s", podName) + return nil, noSuitableOffersErr +} diff --git a/contrib/mesos/pkg/scheduler/ha/doc.go b/contrib/mesos/pkg/scheduler/ha/doc.go new file mode 100644 index 00000000000..4e6fc0beda5 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/ha/doc.go @@ -0,0 +1,18 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package ha encapsulates high-availability scheduler concerns. +package ha diff --git a/contrib/mesos/pkg/scheduler/ha/election.go b/contrib/mesos/pkg/scheduler/ha/election.go new file mode 100644 index 00000000000..588b2ba5f6b --- /dev/null +++ b/contrib/mesos/pkg/scheduler/ha/election.go @@ -0,0 +1,73 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package ha + +import ( + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/election" + log "github.com/golang/glog" +) + +type roleType int + +const ( + followerRole roleType = iota + masterRole + retiredRole +) + +type candidateService struct { + sched *SchedulerProcess + newDriver DriverFactory + role roleType + valid ValidationFunc +} + +type ValidationFunc func(desiredUid, currentUid string) + +func NewCandidate(s *SchedulerProcess, f DriverFactory, v ValidationFunc) election.Service { + return &candidateService{ + sched: s, + newDriver: f, + role: followerRole, + valid: v, + } +} + +func (self *candidateService) Validate(desired, current election.Master) { + if self.valid != nil { + self.valid(string(desired), string(current)) + } +} + +func (self *candidateService) Start() { + if self.role == followerRole { + log.Info("elected as master") + self.role = masterRole + self.sched.Elect(self.newDriver) + } +} + +func (self *candidateService) Stop() { + if self.role == masterRole { + log.Info("retiring from master") + self.role = retiredRole + // order is important here, watchers of a SchedulerProcess will + // check SchedulerProcess.Failover() once Done() is closed. + close(self.sched.failover) + self.sched.End() + } +} diff --git a/contrib/mesos/pkg/scheduler/ha/ha.go b/contrib/mesos/pkg/scheduler/ha/ha.go new file mode 100644 index 00000000000..cdfc0c0c5cf --- /dev/null +++ b/contrib/mesos/pkg/scheduler/ha/ha.go @@ -0,0 +1,285 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ha + +import ( + "fmt" + "sync/atomic" + + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime" + log "github.com/golang/glog" + mesos "github.com/mesos/mesos-go/mesosproto" + bindings "github.com/mesos/mesos-go/scheduler" +) + +type DriverFactory func() (bindings.SchedulerDriver, error) + +type stageType int32 + +const ( + initStage stageType = iota + standbyStage + masterStage + finStage +) + +func (stage *stageType) transition(from, to stageType) bool { + return atomic.CompareAndSwapInt32((*int32)(stage), int32(from), int32(to)) +} + +func (s *stageType) transitionTo(to stageType, unless ...stageType) bool { + if len(unless) == 0 { + atomic.StoreInt32((*int32)(s), int32(to)) + return true + } + for { + state := s.get() + for _, x := range unless { + if state == x { + return false + } + } + if s.transition(state, to) { + return true + } + } +} + +func (stage *stageType) get() stageType { + return stageType(atomic.LoadInt32((*int32)(stage))) +} + +// execute some action in the deferred context of the process, but only if we +// match the stage of the process at the time the action is executed. 
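+// For example (an illustrative sketch, not code from this package), an action
+// that must only run once the process has been elected master could be
+// deferred as:
+//
+//	errCh := masterStage.Do(p, proc.Action(func() {
+//		// runs only while p is in masterStage
+//	}))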
+func (stage stageType) Do(p *SchedulerProcess, a proc.Action) <-chan error { + errOnce := proc.NewErrorOnce(p.fin) + errOuter := p.Do(proc.Action(func() { + switch stage { + case standbyStage: + //await standby signal or death + select { + case <-p.standby: + case <-p.Done(): + } + case masterStage: + //await elected signal or death + select { + case <-p.elected: + case <-p.Done(): + } + case finStage: + errOnce.Reportf("scheduler process is dying, dropping action") + return + default: + } + errOnce.Report(stage.When(p, a)) + })) + return errOnce.Send(errOuter).Err() +} + +// execute some action only if we match the stage of the scheduler process +func (stage stageType) When(p *SchedulerProcess, a proc.Action) (err error) { + if stage != (&p.stage).get() { + err = fmt.Errorf("failed to execute deferred action, expected lifecycle stage %v instead of %v", stage, p.stage) + } else { + a() + } + return +} + +type SchedulerProcess struct { + proc.Process + bindings.Scheduler + stage stageType + elected chan struct{} // upon close we've been elected + failover chan struct{} // closed indicates that we should failover upon End() + standby chan struct{} + fin chan struct{} +} + +func New(sched bindings.Scheduler) *SchedulerProcess { + p := &SchedulerProcess{ + Process: proc.New(), + Scheduler: sched, + stage: initStage, + elected: make(chan struct{}), + failover: make(chan struct{}), + standby: make(chan struct{}), + fin: make(chan struct{}), + } + runtime.On(p.Running(), p.begin) + return p +} + +func (self *SchedulerProcess) begin() { + if (&self.stage).transition(initStage, standbyStage) { + close(self.standby) + log.Infoln("scheduler process entered standby stage") + } else { + log.Errorf("failed to transition from init to standby stage") + } +} + +func (self *SchedulerProcess) End() <-chan struct{} { + if (&self.stage).transitionTo(finStage, finStage) { + defer close(self.fin) + log.Infoln("scheduler process entered fin stage") + } + return self.Process.End() +} + +func (self *SchedulerProcess) Elect(newDriver DriverFactory) { + errOnce := proc.NewErrorOnce(self.fin) + proc.OnError(errOnce.Send(standbyStage.Do(self, proc.Action(func() { + if !(&self.stage).transition(standbyStage, masterStage) { + log.Errorf("failed to transition from standby to master stage, aborting") + self.End() + return + } + log.Infoln("scheduler process entered master stage") + drv, err := newDriver() + if err != nil { + log.Errorf("failed to fetch scheduler driver: %v", err) + self.End() + return + } + log.V(1).Infoln("starting driver...") + stat, err := drv.Start() + if stat == mesos.Status_DRIVER_RUNNING && err == nil { + log.Infoln("driver started successfully and is running") + close(self.elected) + go func() { + defer self.End() + _, err := drv.Join() + if err != nil { + log.Errorf("driver failed with error: %v", err) + } + errOnce.Report(err) + }() + return + } + defer self.End() + if err != nil { + log.Errorf("failed to start scheduler driver: %v", err) + } else { + log.Errorf("expected RUNNING status, not %v", stat) + } + }))).Err(), func(err error) { + defer self.End() + log.Errorf("failed to handle election event, aborting: %v", err) + }, self.fin) +} + +func (self *SchedulerProcess) Terminal() <-chan struct{} { + return self.fin +} + +func (self *SchedulerProcess) Elected() <-chan struct{} { + return self.elected +} + +func (self *SchedulerProcess) Failover() <-chan struct{} { + return self.failover +} + +type masterProcess struct { + *SchedulerProcess + doer proc.Doer +} + +func (self *masterProcess) 
Done() <-chan struct{} { + return self.SchedulerProcess.Terminal() +} + +func (self *masterProcess) Do(a proc.Action) <-chan error { + return self.doer.Do(a) +} + +// returns a Process instance that will only execute a proc.Action if the scheduler is the elected master +func (self *SchedulerProcess) Master() proc.Process { + return &masterProcess{ + SchedulerProcess: self, + doer: proc.DoWith(self, proc.DoerFunc(func(a proc.Action) <-chan error { + return proc.ErrorChan(masterStage.When(self, a)) + })), + } +} + +func (self *SchedulerProcess) logError(ch <-chan error) { + self.OnError(ch, func(err error) { + log.Errorf("failed to execute scheduler action: %v", err) + }) +} + +func (self *SchedulerProcess) Registered(drv bindings.SchedulerDriver, fid *mesos.FrameworkID, mi *mesos.MasterInfo) { + self.logError(self.Master().Do(proc.Action(func() { + self.Scheduler.Registered(drv, fid, mi) + }))) +} + +func (self *SchedulerProcess) Reregistered(drv bindings.SchedulerDriver, mi *mesos.MasterInfo) { + self.logError(self.Master().Do(proc.Action(func() { + self.Scheduler.Reregistered(drv, mi) + }))) +} + +func (self *SchedulerProcess) Disconnected(drv bindings.SchedulerDriver) { + self.logError(self.Master().Do(proc.Action(func() { + self.Scheduler.Disconnected(drv) + }))) +} + +func (self *SchedulerProcess) ResourceOffers(drv bindings.SchedulerDriver, off []*mesos.Offer) { + self.logError(self.Master().Do(proc.Action(func() { + self.Scheduler.ResourceOffers(drv, off) + }))) +} + +func (self *SchedulerProcess) OfferRescinded(drv bindings.SchedulerDriver, oid *mesos.OfferID) { + self.logError(self.Master().Do(proc.Action(func() { + self.Scheduler.OfferRescinded(drv, oid) + }))) +} + +func (self *SchedulerProcess) StatusUpdate(drv bindings.SchedulerDriver, ts *mesos.TaskStatus) { + self.logError(self.Master().Do(proc.Action(func() { + self.Scheduler.StatusUpdate(drv, ts) + }))) +} + +func (self *SchedulerProcess) FrameworkMessage(drv bindings.SchedulerDriver, eid *mesos.ExecutorID, sid *mesos.SlaveID, m string) { + self.logError(self.Master().Do(proc.Action(func() { + self.Scheduler.FrameworkMessage(drv, eid, sid, m) + }))) +} + +func (self *SchedulerProcess) SlaveLost(drv bindings.SchedulerDriver, sid *mesos.SlaveID) { + self.logError(self.Master().Do(proc.Action(func() { + self.Scheduler.SlaveLost(drv, sid) + }))) +} + +func (self *SchedulerProcess) ExecutorLost(drv bindings.SchedulerDriver, eid *mesos.ExecutorID, sid *mesos.SlaveID, x int) { + self.logError(self.Master().Do(proc.Action(func() { + self.Scheduler.ExecutorLost(drv, eid, sid, x) + }))) +} + +func (self *SchedulerProcess) Error(drv bindings.SchedulerDriver, msg string) { + self.Scheduler.Error(drv, msg) +} diff --git a/contrib/mesos/pkg/scheduler/meta/annotations.go b/contrib/mesos/pkg/scheduler/meta/annotations.go new file mode 100644 index 00000000000..5c9bf099182 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/meta/annotations.go @@ -0,0 +1,30 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package meta + +// kubernetes api object annotations +const ( + BindingHostKey = "k8s.mesosphere.io/bindingHost" + TaskIdKey = "k8s.mesosphere.io/taskId" + SlaveIdKey = "k8s.mesosphere.io/slaveId" + OfferIdKey = "k8s.mesosphere.io/offerId" + ExecutorIdKey = "k8s.mesosphere.io/executorId" + PortMappingKeyPrefix = "k8s.mesosphere.io/port_" + PortMappingKeyFormat = PortMappingKeyPrefix + "%s_%d" + PortNameMappingKeyPrefix = "k8s.mesosphere.io/portName_" + PortNameMappingKeyFormat = PortNameMappingKeyPrefix + "%s_%s" +) diff --git a/contrib/mesos/pkg/scheduler/meta/doc.go b/contrib/mesos/pkg/scheduler/meta/doc.go new file mode 100644 index 00000000000..f7e5aeeeacf --- /dev/null +++ b/contrib/mesos/pkg/scheduler/meta/doc.go @@ -0,0 +1,19 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package meta defines framework constants used as keys in k8s annotations +// that are attached to k8s pods +package meta diff --git a/contrib/mesos/pkg/scheduler/meta/store.go b/contrib/mesos/pkg/scheduler/meta/store.go new file mode 100644 index 00000000000..7203a12c948 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/meta/store.go @@ -0,0 +1,24 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package meta + +// keys for things that we store +const ( + //TODO(jdef) this should also be a format instead of a fixed path + FrameworkIDKey = "/mesos/k8sm/frameworkid" + DefaultElectionFormat = "/mesos/k8sm/framework/%s/leader" +) diff --git a/contrib/mesos/pkg/scheduler/metrics/doc.go b/contrib/mesos/pkg/scheduler/metrics/doc.go new file mode 100644 index 00000000000..861c0205c61 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/metrics/doc.go @@ -0,0 +1,18 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+// Package metrics defines and exposes instrumentation metrics of the scheduler.
+package metrics
diff --git a/contrib/mesos/pkg/scheduler/metrics/metrics.go b/contrib/mesos/pkg/scheduler/metrics/metrics.go
new file mode 100644
index 00000000000..d9e6fbaeccd
--- /dev/null
+++ b/contrib/mesos/pkg/scheduler/metrics/metrics.go
@@ -0,0 +1,102 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"sync"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+const (
+	schedulerSubsystem = "k8sm_scheduler"
+)
+
+var (
+	QueueWaitTime = prometheus.NewSummary(
+		prometheus.SummaryOpts{
+			Subsystem: schedulerSubsystem,
+			Name:      "queue_wait_time_microseconds",
+			Help:      "Launch queue wait time in microseconds",
+		},
+	)
+	BindLatency = prometheus.NewSummary(
+		prometheus.SummaryOpts{
+			Subsystem: schedulerSubsystem,
+			Name:      "bind_latency_microseconds",
+			Help:      "Latency in microseconds between pod-task launch and pod binding.",
+		},
+	)
+	StatusUpdates = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: schedulerSubsystem,
+			Name:      "status_updates",
+			Help:      "Counter of TaskStatus updates, broken out by source, reason, state.",
+		},
+		[]string{"source", "reason", "state"},
+	)
+	ReconciliationLatency = prometheus.NewSummary(
+		prometheus.SummaryOpts{
+			Subsystem: schedulerSubsystem,
+			Name:      "reconciliation_latency_microseconds",
+			Help:      "Latency in microseconds to execute explicit task reconciliation.",
+		},
+	)
+	ReconciliationRequested = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: schedulerSubsystem,
+			Name:      "reconciliation_requested",
+			Help:      "Counter of requested task reconciliations, broken out by kind.",
+		},
+		[]string{"kind"},
+	)
+	ReconciliationExecuted = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: schedulerSubsystem,
+			Name:      "reconciliation_executed",
+			Help:      "Counter of executed task reconciliation requests, broken out by kind.",
+		},
+		[]string{"kind"},
+	)
+	ReconciliationCancelled = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: schedulerSubsystem,
+			Name:      "reconciliation_cancelled",
+			Help:      "Counter of cancelled task reconciliation requests, broken out by kind.",
+		},
+		[]string{"kind"},
+	)
+)
+
+var registerMetrics sync.Once
+
+func Register() {
+	registerMetrics.Do(func() {
+		prometheus.MustRegister(QueueWaitTime)
+		prometheus.MustRegister(BindLatency)
+		prometheus.MustRegister(StatusUpdates)
+		prometheus.MustRegister(ReconciliationLatency)
+		prometheus.MustRegister(ReconciliationRequested)
+		prometheus.MustRegister(ReconciliationExecuted)
+		prometheus.MustRegister(ReconciliationCancelled)
+	})
+}
+
+func InMicroseconds(d time.Duration) float64 {
+	return float64(d.Nanoseconds() / time.Microsecond.Nanoseconds())
+}
diff --git a/contrib/mesos/pkg/scheduler/mock_test.go b/contrib/mesos/pkg/scheduler/mock_test.go
new file mode 100644
index 00000000000..1dbb9da78d0
--- /dev/null
+++
b/contrib/mesos/pkg/scheduler/mock_test.go
@@ -0,0 +1,203 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scheduler
+
+import (
+	"sync"
+	"testing"
+
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
+	mesos "github.com/mesos/mesos-go/mesosproto"
+	"github.com/stretchr/testify/mock"
+)
+
+// MockScheduler implements schedulerInterface
+type MockScheduler struct {
+	sync.RWMutex
+	mock.Mock
+}
+
+func (m *MockScheduler) slaveFor(id string) (slave *Slave, ok bool) {
+	args := m.Called(id)
+	x := args.Get(0)
+	if x != nil {
+		slave = x.(*Slave)
+	}
+	ok = args.Bool(1)
+	return
+}
+func (m *MockScheduler) algorithm() (f PodScheduleFunc) {
+	args := m.Called()
+	x := args.Get(0)
+	if x != nil {
+		f = x.(PodScheduleFunc)
+	}
+	return
+}
+func (m *MockScheduler) createPodTask(ctx api.Context, pod *api.Pod) (task *podtask.T, err error) {
+	args := m.Called(ctx, pod)
+	x := args.Get(0)
+	if x != nil {
+		task = x.(*podtask.T)
+	}
+	err = args.Error(1)
+	return
+}
+func (m *MockScheduler) offers() (f offers.Registry) {
+	args := m.Called()
+	x := args.Get(0)
+	if x != nil {
+		f = x.(offers.Registry)
+	}
+	return
+}
+func (m *MockScheduler) tasks() (f podtask.Registry) {
+	args := m.Called()
+	x := args.Get(0)
+	if x != nil {
+		f = x.(podtask.Registry)
+	}
+	return
+}
+func (m *MockScheduler) killTask(taskId string) error {
+	args := m.Called(taskId)
+	return args.Error(0)
+}
+func (m *MockScheduler) launchTask(task *podtask.T) error {
+	args := m.Called(task)
+	return args.Error(0)
+}
+
+// @deprecated this is a placeholder for me to test the mock package
+func TestNoSlavesYet(t *testing.T) {
+	obj := &MockScheduler{}
+	obj.On("slaveFor", "foo").Return(nil, false)
+	obj.slaveFor("foo")
+	obj.AssertExpectations(t)
+}
+
+/*-----------------------------------------------------------------------------
+ |
+ | this really belongs in the mesos-go package, but that's being updated soon
+ | anyway, so just keep it here for now unless we *really* need it there.
+ |
+ \-----------------------------------------------------------------------------
+
+// Scheduler defines the interface that needs to be implemented.
+type Scheduler interface {
+	Registered(SchedulerDriver, *FrameworkID, *MasterInfo)
+	Reregistered(SchedulerDriver, *MasterInfo)
+	Disconnected(SchedulerDriver)
+	ResourceOffers(SchedulerDriver, []*Offer)
+	OfferRescinded(SchedulerDriver, *OfferID)
+	StatusUpdate(SchedulerDriver, *TaskStatus)
+	FrameworkMessage(SchedulerDriver, *ExecutorID, *SlaveID, string)
+	SlaveLost(SchedulerDriver, *SlaveID)
+	ExecutorLost(SchedulerDriver, *ExecutorID, *SlaveID, int)
+	Error(SchedulerDriver, string)
+}
+*/
+
+func status(args mock.Arguments, at int) (val mesos.Status) {
+	if x := args.Get(at); x != nil {
+		val = x.(mesos.Status)
+	}
+	return
+}
+
+type extendedMock struct {
+	mock.Mock
+}
+
+// Upon returns a chan that closes upon the execution of the most recently registered call.
+func (m *extendedMock) Upon() <-chan struct{} {
+	ch := make(chan struct{})
+	call := &m.ExpectedCalls[len(m.ExpectedCalls)-1]
+	f := call.Run
+	call.Run = func(args mock.Arguments) {
+		defer close(ch)
+		if f != nil {
+			f(args)
+		}
+	}
+	return ch
+}
+
+type MockSchedulerDriver struct {
+	extendedMock
+}
+
+func (m *MockSchedulerDriver) Init() error {
+	args := m.Called()
+	return args.Error(0)
+}
+func (m *MockSchedulerDriver) Start() (mesos.Status, error) {
+	args := m.Called()
+	return status(args, 0), args.Error(1)
+}
+func (m *MockSchedulerDriver) Stop(b bool) (mesos.Status, error) {
+	args := m.Called(b)
+	return status(args, 0), args.Error(1)
+}
+func (m *MockSchedulerDriver) Abort() (mesos.Status, error) {
+	args := m.Called()
+	return status(args, 0), args.Error(1)
+}
+func (m *MockSchedulerDriver) Join() (mesos.Status, error) {
+	args := m.Called()
+	return status(args, 0), args.Error(1)
+}
+func (m *MockSchedulerDriver) Run() (mesos.Status, error) {
+	args := m.Called()
+	return status(args, 0), args.Error(1)
+}
+func (m *MockSchedulerDriver) RequestResources(r []*mesos.Request) (mesos.Status, error) {
+	args := m.Called(r)
+	return status(args, 0), args.Error(1)
+}
+func (m *MockSchedulerDriver) ReconcileTasks(statuses []*mesos.TaskStatus) (mesos.Status, error) {
+	args := m.Called(statuses)
+	return status(args, 0), args.Error(1)
+}
+func (m *MockSchedulerDriver) LaunchTasks(offerIds []*mesos.OfferID, ti []*mesos.TaskInfo, f *mesos.Filters) (mesos.Status, error) {
+	args := m.Called(offerIds, ti, f)
+	return status(args, 0), args.Error(1)
+}
+func (m *MockSchedulerDriver) KillTask(tid *mesos.TaskID) (mesos.Status, error) {
+	args := m.Called(tid)
+	return status(args, 0), args.Error(1)
+}
+func (m *MockSchedulerDriver) DeclineOffer(oid *mesos.OfferID, f *mesos.Filters) (mesos.Status, error) {
+	args := m.Called(oid, f)
+	return status(args, 0), args.Error(1)
+}
+func (m *MockSchedulerDriver) ReviveOffers() (mesos.Status, error) {
+	args := m.Called()
+	// the error is the second Return value, as in the other driver methods
+	return status(args, 0), args.Error(1)
+}
+func (m *MockSchedulerDriver) SendFrameworkMessage(eid *mesos.ExecutorID, sid *mesos.SlaveID, s string) (mesos.Status, error) {
+	args := m.Called(eid, sid, s)
+	return status(args, 0), args.Error(1)
+}
+func (m *MockSchedulerDriver) Destroy() {
+	m.Called()
+}
+func (m *MockSchedulerDriver) Wait() {
+	m.Called()
+}
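+
+// Illustrative use of extendedMock.Upon in a test (a sketch only; the variable
+// names are hypothetical):
+//
+//	d := &MockSchedulerDriver{}
+//	d.On("Stop", false).Return(mesos.Status_DRIVER_STOPPED, nil)
+//	stopped := d.Upon()
+//	// ... exercise code that is expected to call Stop(false) ...
+//	<-stopped // unblocks once the mocked Stop has actually been invoked
diff --git a/contrib/mesos/pkg/scheduler/plugin.go b/contrib/mesos/pkg/scheduler/plugin.go
new file mode 100644
index 00000000000..27027148c02
--- /dev/null
+++ b/contrib/mesos/pkg/scheduler/plugin.go
@@ -0,0 +1,875 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.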
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheduler + +import ( + "fmt" + "io" + "net/http" + "strconv" + "sync" + "time" + + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/backoff" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/queue" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime" + annotation "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask" + "github.com/GoogleCloudPlatform/kubernetes/pkg/api" + "github.com/GoogleCloudPlatform/kubernetes/pkg/api/errors" + "github.com/GoogleCloudPlatform/kubernetes/pkg/client" + "github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache" + "github.com/GoogleCloudPlatform/kubernetes/pkg/client/record" + "github.com/GoogleCloudPlatform/kubernetes/pkg/fields" + "github.com/GoogleCloudPlatform/kubernetes/pkg/util" + plugin "github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler" + "github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm" + log "github.com/golang/glog" + mesos "github.com/mesos/mesos-go/mesosproto" + mutil "github.com/mesos/mesos-go/mesosutil" +) + +const ( + enqueuePopTimeout = 200 * time.Millisecond + enqueueWaitTimeout = 1 * time.Second + yieldPopTimeout = 200 * time.Millisecond + yieldWaitTimeout = 1 * time.Second + pluginRecoveryDelay = 100 * time.Millisecond // delay after scheduler plugin crashes, before we resume scheduling +) + +// scheduler abstraction to allow for easier unit testing +type schedulerInterface interface { + sync.Locker // synchronize scheduler plugin operations + SlaveIndex + algorithm() PodScheduleFunc + offers() offers.Registry + tasks() podtask.Registry + + // driver calls + + killTask(taskId string) error + launchTask(*podtask.T) error + + // convenience + + createPodTask(api.Context, *api.Pod) (*podtask.T, error) +} + +type k8smScheduler struct { + sync.Mutex + internal *KubernetesScheduler +} + +func (k *k8smScheduler) algorithm() PodScheduleFunc { + return k.internal.scheduleFunc +} + +func (k *k8smScheduler) offers() offers.Registry { + return k.internal.offers +} + +func (k *k8smScheduler) tasks() podtask.Registry { + return k.internal.taskRegistry +} + +func (k *k8smScheduler) createPodTask(ctx api.Context, pod *api.Pod) (*podtask.T, error) { + return podtask.New(ctx, "", *pod, k.internal.executor) +} + +func (k *k8smScheduler) slaveFor(id string) (slave *Slave, ok bool) { + slave, ok = k.internal.slaves.getSlave(id) + return +} + +func (k *k8smScheduler) killTask(taskId string) error { + killTaskId := mutil.NewTaskID(taskId) + _, err := k.internal.driver.KillTask(killTaskId) + return err +} + +func (k *k8smScheduler) launchTask(task *podtask.T) error { + // assume caller is holding scheduler lock + taskList := []*mesos.TaskInfo{task.BuildTaskInfo()} + offerIds := []*mesos.OfferID{task.Offer.Details().Id} + filters := &mesos.Filters{} + _, err := 
k.internal.driver.LaunchTasks(offerIds, taskList, filters)
+	return err
+}
+
+type binder struct {
+	api schedulerInterface
+}
+
+// implements binding.Registry, launches the pod-associated-task in mesos
func (b *binder) Bind(binding *api.Binding) error {
+
+	ctx := api.WithNamespace(api.NewContext(), binding.Namespace)
+
+	// default upstream scheduler passes pod.Name as binding.Name
+	podKey, err := podtask.MakePodKey(ctx, binding.Name)
+	if err != nil {
+		return err
+	}
+
+	b.api.Lock()
+	defer b.api.Unlock()
+
+	switch task, state := b.api.tasks().ForPod(podKey); state {
+	case podtask.StatePending:
+		return b.bind(ctx, binding, task)
+	default:
+		// in this case it's likely that the pod has been deleted between Schedule
+		// and Bind calls
+		log.Infof("No pending task for pod %s", podKey)
+		return noSuchPodErr //TODO(jdef) this error is somewhat misleading since the task could be running?!
+	}
+}
+
+func (b *binder) rollback(task *podtask.T, err error) error {
+	task.Offer.Release()
+	task.Reset()
+	if err2 := b.api.tasks().Update(task); err2 != nil {
+		log.Errorf("failed to update pod task: %v", err2)
+	}
+	return err
+}
+
+// assumes that: caller has acquired scheduler lock and that the task is still pending
+func (b *binder) bind(ctx api.Context, binding *api.Binding, task *podtask.T) (err error) {
+	// sanity check: ensure that the task HasAcceptedOffer(); it's possible that between
+	// Schedule() and now the offer for this task was rescinded or invalidated.
+	// ((we should never see this here))
+	if !task.HasAcceptedOffer() {
+		return fmt.Errorf("task has not accepted a valid offer %v", task.ID)
+	}
+
+	// By this time, there is a chance that the slave is disconnected.
+	offerId := task.GetOfferId()
+	if offer, ok := b.api.offers().Get(offerId); !ok || offer.HasExpired() {
+		// already rescinded or timed out or otherwise invalidated
+		return b.rollback(task, fmt.Errorf("failed prior to launchTask due to expired offer for task %v", task.ID))
+	}
+
+	if err = b.prepareTaskForLaunch(ctx, binding.Target.Name, task, offerId); err == nil {
+		log.V(2).Infof("launching task: %q on target %q slave %q for pod \"%v/%v\"",
+			task.ID, binding.Target.Name, task.Spec.SlaveID, task.Pod.Namespace, task.Pod.Name)
+		if err = b.api.launchTask(task); err == nil {
+			b.api.offers().Invalidate(offerId)
+			task.Set(podtask.Launched)
+			if err = b.api.tasks().Update(task); err != nil {
+				// this should only happen if the task has been removed or has changed status,
+				// which SHOULD NOT HAPPEN as long as we're synchronizing correctly
+				log.Errorf("failed to update task w/ Launched status: %v", err)
+			}
+			return
+		}
+	}
+	return b.rollback(task, fmt.Errorf("failed to launch task %v: %v", task.ID, err))
+}
+
+//TODO(jdef) unit test this, ensure that task's copy of api.Pod is not modified
+func (b *binder) prepareTaskForLaunch(ctx api.Context, machine string, task *podtask.T, offerId string) error {
+	pod := task.Pod
+
+	// we make an effort here to avoid making changes to the task's copy of the pod, since
+	// we want that to reflect the initial user spec, and not the modified spec that we
+	// build for the executor to consume.
+	oemCt := pod.Spec.Containers
+	pod.Spec.Containers = append([]api.Container{}, oemCt...) // (shallow) clone before mod
+
+	if pod.Annotations == nil {
+		pod.Annotations = make(map[string]string)
+	} else {
+		oemAnn := pod.Annotations
+		pod.Annotations = make(map[string]string)
+		for k, v := range oemAnn {
+			pod.Annotations[k] = v
+		}
+	}
+	pod.Annotations[annotation.BindingHostKey] = machine
+	task.SaveRecoveryInfo(pod.Annotations)
+
+	for _, entry := range task.Spec.PortMap {
+		oemPorts := pod.Spec.Containers[entry.ContainerIdx].Ports
+		ports := append([]api.ContainerPort{}, oemPorts...)
+		p := &ports[entry.PortIdx]
+		p.HostPort = int(entry.OfferPort)
+		op := strconv.FormatUint(entry.OfferPort, 10)
+		pod.Annotations[fmt.Sprintf(annotation.PortMappingKeyFormat, p.Protocol, p.ContainerPort)] = op
+		if p.Name != "" {
+			pod.Annotations[fmt.Sprintf(annotation.PortNameMappingKeyFormat, p.Protocol, p.Name)] = op
+		}
+		pod.Spec.Containers[entry.ContainerIdx].Ports = ports
+	}
+
+	// the kubelet-executor uses this to instantiate the pod
+	log.V(3).Infof("prepared pod spec: %+v", pod)
+
+	data, err := api.Codec.Encode(&pod)
+	if err != nil {
+		log.V(2).Infof("Failed to marshal the pod spec: %v", err)
+		return err
+	}
+	task.Spec.Data = data
+	return nil
+}
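prepareTaskForLaunch is careful to clone every slice and map it touches so the task's pristine copy of the user's pod spec survives the executor-facing modifications. The same copy-on-write idiom in isolation (the helper name is illustrative, not part of the package):

    // cloneAnnotations copies a map before mutation, leaving the original intact.
    func cloneAnnotations(orig map[string]string) map[string]string {
        clone := make(map[string]string, len(orig)+1)
        for k, v := range orig {
            clone[k] = v
        }
        return clone
    }

+
+type kubeScheduler struct {
+	api        schedulerInterface
+	podUpdates queue.FIFO
+}
+
+// Schedule implements the Scheduler interface of Kubernetes.
+// It returns the selected machine's name and an error, if any.
+func (k *kubeScheduler) Schedule(pod *api.Pod, unused algorithm.MinionLister) (string, error) {
+	log.Infof("Try to schedule pod %v\n", pod.Name)
+	ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
+
+	// default upstream scheduler passes pod.Name as binding.PodID
+	podKey, err := podtask.MakePodKey(ctx, pod.Name)
+	if err != nil {
+		return "", err
+	}
+
+	k.api.Lock()
+	defer k.api.Unlock()
+
+	switch task, state := k.api.tasks().ForPod(podKey); state {
+	case podtask.StateUnknown:
+		// There's a bit of a potential race here: a pod could have been yielded()
+		// and then deleted before we get *here*. We use meta to index the pod in
+		// the store since that's what the k8s reflector does.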
+		podName, err := cache.MetaNamespaceKeyFunc(pod)
+		if err != nil {
+			log.Warningf("aborting Schedule, unable to understand pod object %+v", pod)
+			return "", noSuchPodErr
+		}
+		if deleted := k.podUpdates.Poll(podName, queue.DELETE_EVENT); deleted {
+			// avoid scheduling a pod that's been deleted between yieldPod() and Schedule()
+			log.Infof("aborting Schedule, pod has been deleted %+v", pod)
+			return "", noSuchPodErr
+		}
+		return k.doSchedule(k.api.tasks().Register(k.api.createPodTask(ctx, pod)))
+
+	//TODO(jdef) it's possible that the pod state has diverged from what
+	//we knew previously, we should probably update the task.Pod state here
+	//before proceeding with scheduling
+	case podtask.StatePending:
+		if pod.UID != task.Pod.UID {
+			// we're dealing with a brand new pod spec here, so the old one must have been
+			// deleted -- and so our task store is out of sync w/ respect to reality
+			//TODO(jdef) reconcile task
+			return "", fmt.Errorf("task %v spec is out of sync with pod %v spec, aborting schedule", task.ID, pod.Name)
+		} else if task.Has(podtask.Launched) {
+			// task has been marked as "launched" but the pod binding creation may have failed in k8s,
+			// but we're going to let someone else handle it, probably the mesos task error handler
+			return "", fmt.Errorf("task %s has already been launched, aborting schedule", task.ID)
+		} else {
+			return k.doSchedule(task, nil)
+		}
+
+	default:
+		return "", fmt.Errorf("task %s is not pending, nothing to schedule", task.ID)
+	}
+}
+
+// doSchedule calls the configured PodScheduleFunc to match the task against an
+// offer, and returns the name of the machine the task is scheduled on.
+func (k *kubeScheduler) doSchedule(task *podtask.T, err error) (string, error) {
+	var offer offers.Perishable
+	if task.HasAcceptedOffer() {
+		// verify that the offer is still on the table
+		offerId := task.GetOfferId()
+		if details, ok := k.api.offers().Get(offerId); ok && !details.HasExpired() {
+			// skip tasks that already have assigned, still-valid offers
+			offer = task.Offer
+		} else {
+			task.Offer.Release()
+			task.Reset()
+			if err = k.api.tasks().Update(task); err != nil {
+				return "", err
+			}
+		}
+	}
+	if err == nil && offer == nil {
+		offer, err = k.api.algorithm()(k.api.offers(), k.api, task)
+	}
+	if err != nil {
+		return "", err
+	}
+	details := offer.Details()
+	if details == nil {
+		return "", fmt.Errorf("offer already invalid/expired for task %v", task.ID)
+	}
+	slaveId := details.GetSlaveId().GetValue()
+	if slave, ok := k.api.slaveFor(slaveId); !ok {
+		// not much sense in Release()ing the offer here since its owner died
+		offer.Release()
+		k.api.offers().Invalidate(details.Id.GetValue())
+		return "", fmt.Errorf("Slave disappeared (%v) while scheduling task %v", slaveId, task.ID)
+	} else {
+		if task.Offer != nil && task.Offer != offer {
+			return "", fmt.Errorf("task.offer assignment must be idempotent, task %+v: offer %+v", task, offer)
+		}
+		task.Offer = offer
+		task.FillFromDetails(details)
+		if err := k.api.tasks().Update(task); err != nil {
+			offer.Release()
+			return "", err
+		}
+		return slave.HostName, nil
+	}
+}
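algorithm() returns the PodScheduleFunc consulted above. For orientation, a first-fit sketch in the spirit of the fcfs implementation included elsewhere in this PR, assuming the Registry's Walk visitor semantics (return true to stop walking):

    func firstFitScheduleFunc(r offers.Registry, unused SlaveIndex, task *podtask.T) (offers.Perishable, error) {
        var match offers.Perishable
        err := r.Walk(func(p offers.Perishable) (bool, error) {
            if details := p.Details(); details != nil && task.AcceptOffer(details) && p.Acquire() {
                match = p
                return true, nil // stop walking; we found and acquired a fit
            }
            return false, nil // keep walking
        })
        if err != nil {
            return nil, err
        }
        if match == nil {
            return nil, noSuitableOffersErr
        }
        return match, nil
    }

+
+type queuer struct {
+	lock            sync.Mutex       // shared by condition variables of this struct
+	podUpdates      queue.FIFO       // queue of pod updates to be processed
+	podQueue        *queue.DelayFIFO // queue of pods to be scheduled
+	deltaCond       sync.Cond        // pod changes are available for processing
+	unscheduledCond sync.Cond        // there are unscheduled pods for processing
+}
+
+func newQueuer(store queue.FIFO) *queuer {
+	q := &queuer{
+		podQueue:   queue.NewDelayFIFO(),
+		podUpdates: store,
+	}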
+	q.deltaCond.L = &q.lock
+	q.unscheduledCond.L = &q.lock
+	return q
+}
+
+func (q *queuer) installDebugHandlers(mux *http.ServeMux) {
+	mux.HandleFunc("/debug/scheduler/podqueue", func(w http.ResponseWriter, r *http.Request) {
+		for _, x := range q.podQueue.List() {
+			if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
+				break
+			}
+		}
+	})
+	mux.HandleFunc("/debug/scheduler/podstore", func(w http.ResponseWriter, r *http.Request) {
+		for _, x := range q.podUpdates.List() {
+			if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
+				break
+			}
+		}
+	})
+}
+
+// signal that there are probably pod updates waiting to be processed
+func (q *queuer) updatesAvailable() {
+	q.deltaCond.Broadcast()
+}
+
+// delete a pod from the to-be-scheduled queue
+func (q *queuer) dequeue(id string) {
+	q.podQueue.Delete(id)
+}
+
+// re-add a pod to the to-be-scheduled queue, will not overwrite existing pod data (that
+// may have already changed).
+func (q *queuer) requeue(pod *Pod) {
+	// use KeepExisting in case the pod has already been updated (can happen if binding fails
+	// due to constraint violations); we don't want to overwrite a newer entry with stale data.
+	q.podQueue.Add(pod, queue.KeepExisting)
+	q.unscheduledCond.Broadcast()
+}
+
+// same as requeue but calls podQueue.Offer instead of podQueue.Add
+func (q *queuer) reoffer(pod *Pod) {
+	// use KeepExisting in case the pod has already been updated (can happen if binding fails
+	// due to constraint violations); we don't want to overwrite a newer entry with stale data.
+	if q.podQueue.Offer(pod, queue.KeepExisting) {
+		q.unscheduledCond.Broadcast()
+	}
+}
+
+// spawns a go-routine to watch for unscheduled pods and queue them up
+// for scheduling. returns immediately.
+func (q *queuer) Run(done <-chan struct{}) {
+	go runtime.Until(func() {
+		log.Info("Watching for newly created pods")
+		q.lock.Lock()
+		defer q.lock.Unlock()
+
+		for {
+			// limit blocking here for short intervals so that scheduling
+			// may proceed even if there have been no recent pod changes
+			p := q.podUpdates.Await(enqueuePopTimeout)
+			if p == nil {
+				signalled := runtime.After(q.deltaCond.Wait)
+				// we've yielded the lock
+				select {
+				case <-time.After(enqueueWaitTimeout):
+					q.deltaCond.Broadcast() // abort Wait()
+					<-signalled             // wait for lock re-acquisition
+					log.V(4).Infoln("timed out waiting for a pod update")
+				case <-signalled:
+					// we've acquired the lock and there may be
+					// changes for us to process now
+				}
+				continue
+			}
+
+			pod := p.(*Pod)
+			if pod.Spec.NodeName != "" {
+				log.V(3).Infof("dequeuing pod for scheduling: %v", pod.Pod.Name)
+				q.dequeue(pod.GetUID())
+			} else {
+				// use ReplaceExisting because we are always pushing the latest state
+				now := time.Now()
+				pod.deadline = &now
+				if q.podQueue.Offer(pod, queue.ReplaceExisting) {
+					q.unscheduledCond.Broadcast()
+					log.V(3).Infof("queued pod for scheduling: %v", pod.Pod.Name)
+				} else {
+					log.Warningf("failed to queue pod for scheduling: %v", pod.Pod.Name)
+				}
+			}
+		}
+	}, 1*time.Second, done)
}
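Run() works around sync.Cond's lack of a timed wait: the Wait happens in a helper goroutine (runtime.After) while the caller selects on a timer and a completion channel, broadcasting to abort the Wait on timeout. The trick distilled into a standalone helper (illustrative; like the code above, it relies on sync.Mutex not enforcing goroutine ownership):

    // waitWithTimeout waits on cond, giving up after d. The caller must hold
    // cond.L on entry and holds it again on return.
    func waitWithTimeout(cond *sync.Cond, d time.Duration) (timedOut bool) {
        signalled := make(chan struct{})
        go func() {
            defer close(signalled)
            cond.Wait() // yields cond.L while waiting, re-acquires it before returning
        }()
        select {
        case <-time.After(d):
            cond.Broadcast() // abort Wait()
            <-signalled      // wait for lock re-acquisition
            return true
        case <-signalled:
            return false
        }
    }

+
+// implementation of scheduling plugin's NextPod func; see k8s plugin/pkg/scheduler
+func (q *queuer) yield() *api.Pod {
+	log.V(2).Info("attempting to yield a pod")
+	q.lock.Lock()
+	defer q.lock.Unlock()
+
+	for {
+		// limit blocking here to short intervals so that we don't block the
+		// enqueuer Run() routine for very long
+		kpod := q.podQueue.Await(yieldPopTimeout)
+		if kpod == nil {
+			signalled := runtime.After(q.unscheduledCond.Wait)
+			// lock is yielded at this point and we're going to wait for either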
+			// a timeout, or a signal that there's data
+			select {
+			case <-time.After(yieldWaitTimeout):
+				q.unscheduledCond.Broadcast() // abort Wait()
+				<-signalled                   // wait for the go-routine, and the lock
+				log.V(4).Infoln("timed out waiting for a pod to yield")
+			case <-signalled:
+				// we have acquired the lock, and there
+				// may be a pod for us to pop now
+			}
+			continue
+		}
+
+		pod := kpod.(*Pod).Pod
+		if podName, err := cache.MetaNamespaceKeyFunc(pod); err != nil {
+			log.Warningf("yield unable to understand pod object %+v, will skip: %v", pod, err)
+		} else if !q.podUpdates.Poll(podName, queue.POP_EVENT) {
+			log.V(1).Infof("yield popped a transitioning pod, skipping: %+v", pod)
+		} else if pod.Spec.NodeName != "" {
+			// should never happen if enqueuePods is filtering properly
+			log.Warningf("yield popped an already-scheduled pod, skipping: %+v", pod)
+		} else {
+			return pod
+		}
+	}
+}
+
+type errorHandler struct {
+	api     schedulerInterface
+	backoff *backoff.Backoff
+	qr      *queuer
+}
+
+// implementation of scheduling plugin's Error func; see plugin/pkg/scheduler
+func (k *errorHandler) handleSchedulingError(pod *api.Pod, schedulingErr error) {
+
+	if schedulingErr == noSuchPodErr {
+		log.V(2).Infof("Not rescheduling non-existent pod %v", pod.Name)
+		return
+	}
+
+	log.Infof("Error scheduling %v: %v; retrying", pod.Name, schedulingErr)
+	defer util.HandleCrash()
+
+	// default upstream scheduler passes pod.Name as binding.PodID
+	ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
+	podKey, err := podtask.MakePodKey(ctx, pod.Name)
+	if err != nil {
+		log.Errorf("Failed to construct pod key, aborting scheduling for pod %v: %v", pod.Name, err)
+		return
+	}
+
+	k.backoff.GC()
+	k.api.Lock()
+	defer k.api.Unlock()
+
+	switch task, state := k.api.tasks().ForPod(podKey); state {
+	case podtask.StateUnknown:
+		// if we don't have a mapping here any more then someone deleted the pod
+		log.V(2).Infof("Could not resolve pod to task, aborting pod reschedule: %s", podKey)
+		return
+
+	case podtask.StatePending:
+		if task.Has(podtask.Launched) {
+			log.V(2).Infof("Skipping re-scheduling for already-launched pod %v", podKey)
+			return
+		}
+		breakoutEarly := queue.BreakChan(nil)
+		if schedulingErr == noSuitableOffersErr {
+			log.V(3).Infof("adding backoff breakout handler for pod %v", podKey)
+			breakoutEarly = queue.BreakChan(k.api.offers().Listen(podKey, func(offer *mesos.Offer) bool {
+				k.api.Lock()
+				defer k.api.Unlock()
+				switch task, state := k.api.tasks().Get(task.ID); state {
+				case podtask.StatePending:
+					return !task.Has(podtask.Launched) && task.AcceptOffer(offer)
+				default:
+					// no point in continuing to check for matching offers
+					return true
+				}
+			}))
+		}
+		delay := k.backoff.Get(podKey)
+		log.V(3).Infof("requeuing pod %v with delay %v", podKey, delay)
+		k.qr.requeue(&Pod{Pod: pod, delay: &delay, notify: breakoutEarly})
+
+	default:
+		log.V(2).Infof("Task is no longer pending, aborting reschedule for pod %v", podKey)
+	}
+}
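Scheduling errors feed an exponential backoff keyed by pod, optionally cut short by an offer listener. Based only on the Backoff calls exercised above (New, Get, GC), the usage looks roughly like this; the exact growth behavior is an assumption about the backoff package:

    b := backoff.New(time.Second, time.Minute) // initial and max delays, as wired in NewPluginConfig
    delay := b.Get("default/mypod")            // first failure: short delay
    delay = b.Get("default/mypod")             // repeated failures: delay is expected to grow toward the max
    _ = delay
    b.GC()                                     // periodically drop stale entries, as handleSchedulingError does

+
+type deleter struct {
+	api schedulerInterface
+	qr  *queuer
+}
+
+// currently monitors for "pod deleted" events, upon which deleteOne()
+// is invoked.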
+func (k *deleter) Run(updates <-chan queue.Entry, done <-chan struct{}) {
+	go runtime.Until(func() {
+		for {
+			entry := <-updates
+			pod := entry.Value().(*Pod)
+			if entry.Is(queue.DELETE_EVENT) {
+				if err := k.deleteOne(pod); err != nil {
+					log.Error(err)
+				}
+			} else if !entry.Is(queue.POP_EVENT) {
+				k.qr.updatesAvailable()
+			}
+		}
+	}, 1*time.Second, done)
+}
+
+func (k *deleter) deleteOne(pod *Pod) error {
+	ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
+	podKey, err := podtask.MakePodKey(ctx, pod.Name)
+	if err != nil {
+		return err
+	}
+
+	log.V(2).Infof("pod deleted: %v", podKey)
+
+	// order is important here: we want to make sure we have the lock before
+	// removing the pod from the scheduling queue. this makes the concurrent
+	// execution of scheduler-error-handling and delete-handling easier to
+	// reason about.
+	k.api.Lock()
+	defer k.api.Unlock()
+
+	// prevent the scheduler from attempting to pop this; it's also possible that
+	// it's concurrently being scheduled (somewhere between pod scheduling and
+	// binding) - if so, then we'll end up removing it from taskRegistry which
+	// will abort Bind()ing
+	k.qr.dequeue(pod.GetUID())
+
+	switch task, state := k.api.tasks().ForPod(podKey); state {
+	case podtask.StateUnknown:
+		log.V(2).Infof("Could not resolve pod '%s' to task id", podKey)
+		return noSuchPodErr
+
+	// determine if the task has already been launched to mesos, if not then
+	// cleanup is easier (unregister) since there's no state to sync
+	case podtask.StatePending:
+		if !task.Has(podtask.Launched) {
+			// we've been invoked in between Schedule() and Bind()
+			if task.HasAcceptedOffer() {
+				task.Offer.Release()
+				task.Reset()
+				task.Set(podtask.Deleted)
+				//TODO(jdef) probably want better handling here
+				if err := k.api.tasks().Update(task); err != nil {
+					return err
+				}
+			}
+			k.api.tasks().Unregister(task)
+			return nil
+		}
+		fallthrough
+
+	case podtask.StateRunning:
+		// signal to watchers that the related pod is going down
+		task.Set(podtask.Deleted)
+		if err := k.api.tasks().Update(task); err != nil {
+			log.Errorf("failed to update task w/ Deleted status: %v", err)
+		}
+		return k.api.killTask(task.ID)
+
+	default:
+		log.Infof("cannot kill pod '%s': non-terminal task not found %v", podKey, task.ID)
+		return noSuchTaskErr
+	}
+}
+
+// NewDefaultPluginConfig creates a scheduler plugin config and all supporting
+// background functions.
+func (k *KubernetesScheduler) NewDefaultPluginConfig(terminate <-chan struct{}, mux *http.ServeMux) *PluginConfig {
+	// use ListWatch watching pods using the client by default
+	return k.NewPluginConfig(terminate, mux, createAllPodsLW(k.client))
+}
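Both loops above run under runtime.Until, which, as used here, appears to re-invoke the body whenever it returns, pausing the given period between runs, until done closes. Assuming those semantics, a minimal supervised worker would look like this (the worker body is hypothetical):

    done := make(chan struct{})
    go runtime.Until(func() {
        // hypothetical worker step; when it returns, Until is expected
        // to re-run it after the 1s pause
        processOneUpdate()
    }, 1*time.Second, done)
    // ... later:
    close(done) // ends the supervision loop

+
+func (k *KubernetesScheduler) NewPluginConfig(terminate <-chan struct{}, mux *http.ServeMux,
+	podsWatcher *cache.ListWatch) *PluginConfig {
+
+	// Watch and queue pods that need scheduling.
+	updates := make(chan queue.Entry, k.schedcfg.UpdatesBacklog)
+	podUpdates := &podStoreAdapter{queue.NewHistorical(updates)}
+	reflector := cache.NewReflector(podsWatcher, &api.Pod{}, podUpdates, 0)
+
+	// lock that guards critical sections that involve transferring pods from
+	// the store (cache) to the scheduling queue; its purpose is to maintain
+	// an ordering (vs interleaving) of operations that's easier to reason about.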
+ kapi := &k8smScheduler{internal: k} + q := newQueuer(podUpdates) + podDeleter := &deleter{ + api: kapi, + qr: q, + } + eh := &errorHandler{ + api: kapi, + backoff: backoff.New(k.schedcfg.InitialPodBackoff.Duration, k.schedcfg.MaxPodBackoff.Duration), + qr: q, + } + startLatch := make(chan struct{}) + eventBroadcaster := record.NewBroadcaster() + runtime.On(startLatch, func() { + eventBroadcaster.StartRecordingToSink(k.client.Events("")) + reflector.Run() // TODO(jdef) should listen for termination + podDeleter.Run(updates, terminate) + q.Run(terminate) + + q.installDebugHandlers(mux) + podtask.InstallDebugHandlers(k.taskRegistry, mux) + }) + return &PluginConfig{ + Config: &plugin.Config{ + MinionLister: nil, + Algorithm: &kubeScheduler{ + api: kapi, + podUpdates: podUpdates, + }, + Binder: &binder{api: kapi}, + NextPod: q.yield, + Error: eh.handleSchedulingError, + Recorder: eventBroadcaster.NewRecorder(api.EventSource{Component: "scheduler"}), + }, + api: kapi, + client: k.client, + qr: q, + deleter: podDeleter, + starting: startLatch, + } +} + +type PluginConfig struct { + *plugin.Config + api schedulerInterface + client *client.Client + qr *queuer + deleter *deleter + starting chan struct{} // startup latch +} + +func NewPlugin(c *PluginConfig) PluginInterface { + return &schedulingPlugin{ + config: c.Config, + api: c.api, + client: c.client, + qr: c.qr, + deleter: c.deleter, + starting: c.starting, + } +} + +type schedulingPlugin struct { + config *plugin.Config + api schedulerInterface + client *client.Client + qr *queuer + deleter *deleter + starting chan struct{} +} + +func (s *schedulingPlugin) Run(done <-chan struct{}) { + defer close(s.starting) + go runtime.Until(s.scheduleOne, pluginRecoveryDelay, done) +} + +// hacked from GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/scheduler.go, +// with the Modeler stuff removed since we don't use it because we have mesos. +func (s *schedulingPlugin) scheduleOne() { + pod := s.config.NextPod() + log.V(3).Infof("Attempting to schedule: %v", pod) + dest, err := s.config.Algorithm.Schedule(pod, s.config.MinionLister) // call kubeScheduler.Schedule + if err != nil { + log.V(1).Infof("Failed to schedule: %v", pod) + s.config.Recorder.Eventf(pod, "failedScheduling", "Error scheduling: %v", err) + s.config.Error(pod, err) + return + } + b := &api.Binding{ + ObjectMeta: api.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name}, + Target: api.ObjectReference{ + Kind: "Node", + Name: dest, + }, + } + if err := s.config.Binder.Bind(b); err != nil { + log.V(1).Infof("Failed to bind pod: %v", err) + s.config.Recorder.Eventf(pod, "failedScheduling", "Binding rejected: %v", err) + s.config.Error(pod, err) + return + } + s.config.Recorder.Eventf(pod, "scheduled", "Successfully assigned %v to %v", pod.Name, dest) +} + +// this pod may be out of sync with respect to the API server registry: +// this pod | apiserver registry +// -------------|---------------------- +// host=.* | 404 ; pod was deleted +// host=.* | 5xx ; failed to sync, try again later? +// host="" | host="" ; perhaps no updates to process? +// host="" | host="..." ; pod has been scheduled and assigned, is there a task assigned? (check TaskIdKey in binding?) +// host="..." | host="" ; pod is no longer scheduled, does it need to be re-queued? +// host="..." | host="..." ; perhaps no updates to process? 
+//
+// TODO(jdef) this needs an integration test
+func (s *schedulingPlugin) reconcilePod(oldPod api.Pod) {
+	log.V(1).Infof("reconcile pod %v", oldPod.Name)
+	ctx := api.WithNamespace(api.NewDefaultContext(), oldPod.Namespace)
+	pod, err := s.client.Pods(api.NamespaceValue(ctx)).Get(oldPod.Name)
+	if err != nil {
+		if errors.IsNotFound(err) {
+			// attempt to delete
+			if err = s.deleter.deleteOne(&Pod{Pod: &oldPod}); err != nil && err != noSuchPodErr && err != noSuchTaskErr {
+				log.Errorf("failed to delete pod: %v: %v", oldPod.Name, err)
+			}
+		} else {
+			//TODO(jdef) other errors should probably trigger a retry (w/ backoff).
+			//For now, drop the pod on the floor
+			log.Warningf("aborting reconciliation for pod %v: %v", oldPod.Name, err)
+		}
+		return
+	}
+	if oldPod.Spec.NodeName != pod.Spec.NodeName {
+		if pod.Spec.NodeName == "" {
+			// pod is unscheduled.
+			// it's possible that we dropped the pod in the scheduler error handler
+			// because of task misalignment with the pod (task.Has(podtask.Launched) == true)
+
+			podKey, err := podtask.MakePodKey(ctx, pod.Name)
+			if err != nil {
+				log.Error(err)
+				return
+			}
+
+			s.api.Lock()
+			defer s.api.Unlock()
+
+			if _, state := s.api.tasks().ForPod(podKey); state != podtask.StateUnknown {
+				//TODO(jdef) reconcile the task
+				log.Errorf("task already registered for pod %v", pod.Name)
+				return
+			}
+
+			now := time.Now()
+			log.V(3).Infof("reoffering pod %v", podKey)
+			s.qr.reoffer(&Pod{
+				Pod:      pod,
+				deadline: &now,
+			})
+		} else {
+			// pod is scheduled.
+			// not sure how this happened behind our backs. attempt to reconstruct
+			// at least a partial podtask.T record.
+			//TODO(jdef) reconcile the task
+			log.Errorf("pod already scheduled: %v", pod.Name)
+		}
+	} else {
+		//TODO(jdef) for now, ignore the fact that the rest of the spec may be different
+		//and assume that our knowledge of the pod aligns with that of the apiserver
+		log.Error("pod reconciliation does not support updates; not yet implemented")
+	}
+}
+
+func parseSelectorOrDie(s string) fields.Selector {
+	selector, err := fields.ParseSelector(s)
+	if err != nil {
+		panic(err)
+	}
+	return selector
+}
+
+// createAllPodsLW returns a listWatch that finds all pods
+func createAllPodsLW(cl *client.Client) *cache.ListWatch {
+	return cache.NewListWatchFromClient(cl, "pods", api.NamespaceAll, parseSelectorOrDie(""))
+}
+
+// Consumes *api.Pod, produces *Pod; the k8s reflector wants to push *api.Pod
+// objects at us, but we want to store the more flexible (Pod) type defined in
+// this package. The adapter implementation facilitates this. It's a little
+// hackish since the object type going in is different than the object type
+// coming out -- you've been warned.
+type podStoreAdapter struct {
+	queue.FIFO
+}
+
+func (psa *podStoreAdapter) Add(obj interface{}) error {
+	pod := obj.(*api.Pod)
+	return psa.FIFO.Add(&Pod{Pod: pod})
+}
+
+func (psa *podStoreAdapter) Update(obj interface{}) error {
+	pod := obj.(*api.Pod)
+	return psa.FIFO.Update(&Pod{Pod: pod})
+}
+
+func (psa *podStoreAdapter) Delete(obj interface{}) error {
+	pod := obj.(*api.Pod)
+	return psa.FIFO.Delete(&Pod{Pod: pod})
+}
+
+func (psa *podStoreAdapter) Get(obj interface{}) (interface{}, bool, error) {
+	pod := obj.(*api.Pod)
+	return psa.FIFO.Get(&Pod{Pod: pod})
+}
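Because the adapter wraps on the way in, readers get *Pod back out and must unwrap accordingly. A rough round-trip, with store construction mirroring NewPluginConfig above; the buffered channel and the key-by-UID Get behavior are assumptions about the queue package:

    updates := make(chan queue.Entry, 16) // drained by a consumer in real use
    store := &podStoreAdapter{queue.NewHistorical(updates)}

    store.Add(&api.Pod{ObjectMeta: api.ObjectMeta{Name: "foo", Namespace: "default"}})
    if obj, ok, _ := store.Get(&api.Pod{ObjectMeta: api.ObjectMeta{Name: "foo", Namespace: "default"}}); ok {
        wrapped := obj.(*Pod) // the wrapped type comes back out
        _ = wrapped.Pod.Name
    }

+
+// Replace will delete the contents of the store, using instead the
+// given list of objects. This store implementation does NOT take ownership of the list.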
+func (psa *podStoreAdapter) Replace(objs []interface{}) error {
+	newobjs := make([]interface{}, len(objs))
+	for i, v := range objs {
+		pod := v.(*api.Pod)
+		newobjs[i] = &Pod{Pod: pod}
+	}
+	return psa.FIFO.Replace(newobjs)
+}
diff --git a/contrib/mesos/pkg/scheduler/plugin_test.go b/contrib/mesos/pkg/scheduler/plugin_test.go
new file mode 100644
index 00000000000..637086b2bd9
--- /dev/null
+++ b/contrib/mesos/pkg/scheduler/plugin_test.go
@@ -0,0 +1,700 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scheduler
+
+import (
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api/testapi"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/runtime"
+	kutil "github.com/GoogleCloudPlatform/kubernetes/pkg/util"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
+
+	assertext "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/assert"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/messages"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/queue"
+	schedcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/config"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/ha"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
+	log "github.com/golang/glog"
+	mesos "github.com/mesos/mesos-go/mesosproto"
+	util "github.com/mesos/mesos-go/mesosutil"
+	bindings "github.com/mesos/mesos-go/scheduler"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/mock"
+)
+
+// An apiserver mock which partially mocks the pods API
+type TestServer struct {
+	server *httptest.Server
+	stats  map[string]uint
+	lock   sync.Mutex
+}
+
+func NewTestServer(t *testing.T, namespace string, mockPodListWatch *MockPodsListWatch) *TestServer {
+	ts := TestServer{
+		stats: map[string]uint{},
+	}
+	mux := http.NewServeMux()
+
+	mux.HandleFunc(testapi.ResourcePath("pods", namespace, ""), func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+		pods := mockPodListWatch.Pods()
+		w.Write([]byte(runtime.EncodeOrDie(testapi.Codec(), &pods)))
+	})
+
+	podsPrefix := testapi.ResourcePath("pods", namespace, "") + "/"
+	mux.HandleFunc(podsPrefix, func(w http.ResponseWriter, r *http.Request) {
+		name := r.URL.Path[len(podsPrefix):]
+
+		// update statistics for this pod
+		ts.lock.Lock()
+		defer ts.lock.Unlock()
+		ts.stats[name] = ts.stats[name] + 1
+
+		p := mockPodListWatch.GetPod(name)
+		if p != nil {
+			w.WriteHeader(http.StatusOK)
+			w.Write([]byte(runtime.EncodeOrDie(testapi.Codec(), p)))
+			return
+		}
+		w.WriteHeader(http.StatusNotFound)
+	})
+
+	mux.HandleFunc(testapi.ResourcePath("events", namespace, ""), func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	})
+ + mux.HandleFunc("/", func(res http.ResponseWriter, req *http.Request) { + t.Errorf("unexpected request: %v", req.RequestURI) + res.WriteHeader(http.StatusNotFound) + }) + + ts.server = httptest.NewServer(mux) + return &ts +} +func (ts *TestServer) Stats(name string) uint { + ts.lock.Lock() + defer ts.lock.Unlock() + + return ts.stats[name] +} + +// Create mock of pods ListWatch, usually listening on the apiserver pods watch endpoint +type MockPodsListWatch struct { + ListWatch cache.ListWatch + fakeWatcher *watch.FakeWatcher + list api.PodList + lock sync.Mutex +} + +func NewMockPodsListWatch(initialPodList api.PodList) *MockPodsListWatch { + lw := MockPodsListWatch{ + fakeWatcher: watch.NewFake(), + list: initialPodList, + } + lw.ListWatch = cache.ListWatch{ + WatchFunc: func(resourceVersion string) (watch.Interface, error) { + return lw.fakeWatcher, nil + }, + ListFunc: func() (runtime.Object, error) { + return &lw.list, nil + }, + } + return &lw +} +func (lw *MockPodsListWatch) Pods() api.PodList { + lw.lock.Lock() + defer lw.lock.Unlock() + + return lw.list +} +func (lw *MockPodsListWatch) GetPod(name string) *api.Pod { + lw.lock.Lock() + defer lw.lock.Unlock() + + for _, p := range lw.list.Items { + if p.Name == name { + return &p + } + } + + return nil +} +func (lw *MockPodsListWatch) Add(pod *api.Pod, notify bool) { + lw.lock.Lock() + defer lw.lock.Unlock() + + lw.list.Items = append(lw.list.Items, *pod) + if notify { + lw.fakeWatcher.Add(pod) + } +} +func (lw *MockPodsListWatch) Modify(pod *api.Pod, notify bool) { + lw.lock.Lock() + defer lw.lock.Unlock() + + for i, otherPod := range lw.list.Items { + if otherPod.Name == pod.Name { + lw.list.Items[i] = *pod + if notify { + lw.fakeWatcher.Modify(pod) + } + return + } + } + log.Fatalf("Cannot find pod %v to modify in MockPodsListWatch", pod.Name) +} +func (lw *MockPodsListWatch) Delete(pod *api.Pod, notify bool) { + lw.lock.Lock() + defer lw.lock.Unlock() + + for i, otherPod := range lw.list.Items { + if otherPod.Name == pod.Name { + lw.list.Items = append(lw.list.Items[:i], lw.list.Items[i+1:]...) 
+ if notify { + lw.fakeWatcher.Delete(&otherPod) + } + return + } + } + log.Fatalf("Cannot find pod %v to delete in MockPodsListWatch", pod.Name) +} + +// Create a pod with a given index, requiring one port +func NewTestPod(i int) *api.Pod { + name := fmt.Sprintf("pod%d", i) + return &api.Pod{ + TypeMeta: api.TypeMeta{APIVersion: testapi.Version()}, + ObjectMeta: api.ObjectMeta{ + Name: name, + Namespace: "default", + SelfLink: fmt.Sprintf("http://1.2.3.4/api/v1beta1/pods/%s", name), + }, + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Ports: []api.ContainerPort{ + { + ContainerPort: 8000 + i, + Protocol: api.ProtocolTCP, + }, + }, + }, + }, + }, + Status: api.PodStatus{ + PodIP: fmt.Sprintf("1.2.3.%d", 4+i), + Conditions: []api.PodCondition{ + { + Type: api.PodReady, + Status: api.ConditionTrue, + }, + }, + }, + } +} + +// Offering some cpus and memory and the 8000-9000 port range +func NewTestOffer(i int) *mesos.Offer { + hostname := fmt.Sprintf("h%d", i) + cpus := util.NewScalarResource("cpus", 3.75) + mem := util.NewScalarResource("mem", 940) + var port8000 uint64 = 8000 + var port9000 uint64 = 9000 + ports8000to9000 := mesos.Value_Range{Begin: &port8000, End: &port9000} + ports := util.NewRangesResource("ports", []*mesos.Value_Range{&ports8000to9000}) + return &mesos.Offer{ + Id: util.NewOfferID(fmt.Sprintf("offer%d", i)), + Hostname: &hostname, + SlaveId: util.NewSlaveID(hostname), + Resources: []*mesos.Resource{cpus, mem, ports}, + } +} + +// Add assertions to reason about event streams +type Event struct { + Object runtime.Object + Reason string + Message string +} + +type EventPredicate func(e Event) bool + +type EventAssertions struct { + assert.Assertions +} + +// EventObserver implements record.EventRecorder for the purposes of validation via EventAssertions. +type EventObserver struct { + fifo chan Event +} + +func NewEventObserver() *EventObserver { + return &EventObserver{ + fifo: make(chan Event, 1000), + } +} +func (o *EventObserver) Event(object runtime.Object, reason, message string) { + o.fifo <- Event{Object: object, Reason: reason, Message: message} +} +func (o *EventObserver) Eventf(object runtime.Object, reason, messageFmt string, args ...interface{}) { + o.fifo <- Event{Object: object, Reason: reason, Message: fmt.Sprintf(messageFmt, args...)} +} +func (o *EventObserver) PastEventf(object runtime.Object, timestamp kutil.Time, reason, messageFmt string, args ...interface{}) { + o.fifo <- Event{Object: object, Reason: reason, Message: fmt.Sprintf(messageFmt, args...)} +} + +func (a *EventAssertions) Event(observer *EventObserver, pred EventPredicate, msgAndArgs ...interface{}) bool { + // parse msgAndArgs: first possibly a duration, otherwise a format string with further args + timeout := time.Second * 2 + msg := "event not received" + msgArgStart := 0 + if len(msgAndArgs) > 0 { + switch msgAndArgs[0].(type) { + case time.Duration: + timeout = msgAndArgs[0].(time.Duration) + msgArgStart += 1 + } + } + if len(msgAndArgs) > msgArgStart { + msg = fmt.Sprintf(msgAndArgs[msgArgStart].(string), msgAndArgs[msgArgStart+1:]...) 
+	}
+
+	// watch events
+	result := make(chan bool)
+	stop := make(chan struct{})
+	go func() {
+		for {
+			select {
+			case e, ok := <-observer.fifo:
+				if !ok {
+					result <- false
+					return
+				} else if pred(e) {
+					log.V(3).Infof("found asserted event for reason '%v': %v", e.Reason, e.Message)
+					result <- true
+					return
+				} else {
+					log.V(5).Infof("ignoring not-asserted event for reason '%v': %v", e.Reason, e.Message)
+				}
+			case _, ok := <-stop:
+				if !ok {
+					return
+				}
+			}
+		}
+	}()
+	defer close(stop)
+
+	// wait for watch to match or timeout
+	select {
+	case matched := <-result:
+		return matched
+	case <-time.After(timeout):
+		return a.Fail(msg)
+	}
+}
+func (a *EventAssertions) EventWithReason(observer *EventObserver, reason string, msgAndArgs ...interface{}) bool {
+	return a.Event(observer, func(e Event) bool {
+		return e.Reason == reason
+	}, msgAndArgs...)
+}
+
+type joinableDriver struct {
+	MockSchedulerDriver
+	joinFunc func() (mesos.Status, error)
+}
+
+// Join invokes joinFunc if it has been set, otherwise blocks forever
+func (m *joinableDriver) Join() (mesos.Status, error) {
+	if m.joinFunc != nil {
+		return m.joinFunc()
+	}
+	select {}
+}
+
+// Create mesos.TaskStatus for a given task
+func newTaskStatusForTask(task *mesos.TaskInfo, state mesos.TaskState) *mesos.TaskStatus {
+	healthy := state == mesos.TaskState_TASK_RUNNING
+	ts := float64(time.Now().UnixNano()) / 1e9 // seconds since epoch, per mesos convention
+	source := mesos.TaskStatus_SOURCE_EXECUTOR
+	return &mesos.TaskStatus{
+		TaskId:     task.TaskId,
+		State:      &state,
+		SlaveId:    task.SlaveId,
+		ExecutorId: task.Executor.ExecutorId,
+		Timestamp:  &ts,
+		Healthy:    &healthy,
+		Source:     &source,
+		Data:       task.Data,
+	}
+}
+
+// Test to create the scheduler plugin with an empty plugin config
+func TestPlugin_New(t *testing.T) {
+	assert := assert.New(t)
+
+	c := PluginConfig{}
+	p := NewPlugin(&c)
+	assert.NotNil(p)
+}
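EventAssertions.Event optionally accepts a leading time.Duration in msgAndArgs to override the default two-second timeout, followed by a format string and its args. For example (the predicate is illustrative, and the strings package is assumed imported):

    assert.Event(eventObserver, func(e Event) bool {
        return e.Reason == "scheduled" && strings.Contains(e.Message, "pod1")
    }, 5*time.Second, "pod1 was not scheduled within %v", 5*time.Second)

+
+// Test to create the scheduler plugin with the config returned by the scheduler,
+// and play through the whole life cycle of the plugin while creating pods, deleting
+// and failing them.
+func TestPlugin_LifeCycle(t *testing.T) {
+	assert := &EventAssertions{*assert.New(t)}
+
+	// create a fake pod watch.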
We use that below to submit new pods to the scheduler + podListWatch := NewMockPodsListWatch(api.PodList{}) + + // create fake apiserver + testApiServer := NewTestServer(t, api.NamespaceDefault, podListWatch) + defer testApiServer.server.Close() + + // create scheduler + testScheduler := New(Config{ + Executor: util.NewExecutorInfo( + util.NewExecutorID("executor-id"), + util.NewCommandInfo("executor-cmd"), + ), + Client: client.NewOrDie(&client.Config{Host: testApiServer.server.URL, Version: testapi.Version()}), + ScheduleFunc: FCFSScheduleFunc, + Schedcfg: *schedcfg.CreateDefaultConfig(), + }) + + assert.NotNil(testScheduler.client, "client is nil") + assert.NotNil(testScheduler.executor, "executor is nil") + assert.NotNil(testScheduler.offers, "offer registry is nil") + + // create scheduler process + schedulerProcess := ha.New(testScheduler) + + // get plugin config from it + c := testScheduler.NewPluginConfig(schedulerProcess.Terminal(), http.DefaultServeMux, &podListWatch.ListWatch) + assert.NotNil(c) + + // make events observable + eventObserver := NewEventObserver() + c.Recorder = eventObserver + + // create plugin + p := NewPlugin(c) + assert.NotNil(p) + + // run plugin + p.Run(schedulerProcess.Terminal()) + defer schedulerProcess.End() + + // init scheduler + err := testScheduler.Init(schedulerProcess.Master(), p, http.DefaultServeMux) + assert.NoError(err) + + // create mock mesos scheduler driver + mockDriver := &joinableDriver{} + mockDriver.On("Start").Return(mesos.Status_DRIVER_RUNNING, nil).Once() + started := mockDriver.Upon() + + mAny := mock.AnythingOfType + mockDriver.On("ReconcileTasks", mAny("[]*mesosproto.TaskStatus")).Return(mesos.Status_DRIVER_RUNNING, nil) + mockDriver.On("SendFrameworkMessage", mAny("*mesosproto.ExecutorID"), mAny("*mesosproto.SlaveID"), mAny("string")). + Return(mesos.Status_DRIVER_RUNNING, nil) + + launchedTasks := make(chan *mesos.TaskInfo, 1) + launchTasksCalledFunc := func(args mock.Arguments) { + taskInfos := args.Get(1).([]*mesos.TaskInfo) + assert.Equal(1, len(taskInfos)) + launchedTasks <- taskInfos[0] + } + mockDriver.On("LaunchTasks", mAny("[]*mesosproto.OfferID"), mAny("[]*mesosproto.TaskInfo"), mAny("*mesosproto.Filters")). 
+ Return(mesos.Status_DRIVER_RUNNING, nil).Run(launchTasksCalledFunc) + + // elect master with mock driver + driverFactory := ha.DriverFactory(func() (bindings.SchedulerDriver, error) { + return mockDriver, nil + }) + schedulerProcess.Elect(driverFactory) + elected := schedulerProcess.Elected() + + // driver will be started + <-started + + // tell scheduler to be registered + testScheduler.Registered( + mockDriver, + util.NewFrameworkID("kubernetes-id"), + util.NewMasterInfo("master-id", (192<<24)+(168<<16)+(0<<8)+1, 5050), + ) + + // wait for being elected + <-elected + + //TODO(jdef) refactor things above here into a test suite setup of some sort + + // fake new, unscheduled pod + pod1 := NewTestPod(1) + podListWatch.Add(pod1, true) // notify watchers + + // wait for failedScheduling event because there is no offer + assert.EventWithReason(eventObserver, "failedScheduling", "failedScheduling event not received") + + // add some matching offer + offers1 := []*mesos.Offer{NewTestOffer(1)} + testScheduler.ResourceOffers(nil, offers1) + + // and wait for scheduled pod + assert.EventWithReason(eventObserver, "scheduled") + select { + case launchedTask := <-launchedTasks: + // report back that the task has been staged, and then started by mesos + testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_STAGING)) + testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_RUNNING)) + + // report back that the task has been lost + mockDriver.AssertNumberOfCalls(t, "SendFrameworkMessage", 0) + testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_LOST)) + + // and wait that framework message is sent to executor + mockDriver.AssertNumberOfCalls(t, "SendFrameworkMessage", 1) + + case <-time.After(5 * time.Second): + t.Fatalf("timed out waiting for launchTasks call") + } + + // start another pod + podNum := 1 + startPod := func(offers []*mesos.Offer) (*api.Pod, *mesos.TaskInfo) { + podNum = podNum + 1 + + // create pod and matching offer + pod := NewTestPod(podNum) + podListWatch.Add(pod, true) // notify watchers + testScheduler.ResourceOffers(mockDriver, offers) + assert.EventWithReason(eventObserver, "scheduled") + + // wait for driver.launchTasks call + select { + case launchedTask := <-launchedTasks: + testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_STAGING)) + testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_RUNNING)) + return pod, launchedTask + + case <-time.After(5 * time.Second): + t.Fatal("timed out waiting for launchTasks") + return nil, nil + } + } + + pod, launchedTask := startPod(offers1) + + // mock drvier.KillTask, should be invoked when a pod is deleted + mockDriver.On("KillTask", mAny("*mesosproto.TaskID")).Return(mesos.Status_DRIVER_RUNNING, nil).Run(func(args mock.Arguments) { + killedTaskId := *(args.Get(0).(*mesos.TaskID)) + assert.Equal(*launchedTask.TaskId, killedTaskId, "expected same TaskID as during launch") + }) + killTaskCalled := mockDriver.Upon() + + // stop it again via the apiserver mock + podListWatch.Delete(pod, true) // notify watchers + + // and wait for the driver killTask call with the correct TaskId + select { + case <-killTaskCalled: + // report back that the task is finished + testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_FINISHED)) + + case <-time.After(5 * time.Second): + t.Fatal("timed out waiting for 
KillTask") + } + + // start pods: + // - which are failing while binding, + // - leading to reconciliation + // - with different states on the apiserver + + failPodFromExecutor := func(task *mesos.TaskInfo) { + beforePodLookups := testApiServer.Stats(pod.Name) + status := newTaskStatusForTask(task, mesos.TaskState_TASK_FAILED) + message := messages.CreateBindingFailure + status.Message = &message + testScheduler.StatusUpdate(mockDriver, status) + + // wait until pod is looked up at the apiserver + assertext.EventuallyTrue(t, time.Second, func() bool { + return testApiServer.Stats(pod.Name) == beforePodLookups+1 + }, "expect that reconcilePod will access apiserver for pod %v", pod.Name) + } + + // 1. with pod deleted from the apiserver + pod, launchedTask = startPod(offers1) + podListWatch.Delete(pod, false) // not notifying the watchers + failPodFromExecutor(launchedTask) + + // 2. with pod still on the apiserver, not bound + pod, launchedTask = startPod(offers1) + failPodFromExecutor(launchedTask) + + // 3. with pod still on the apiserver, bound i.e. host!="" + pod, launchedTask = startPod(offers1) + pod.Spec.NodeName = *offers1[0].Hostname + podListWatch.Modify(pod, false) // not notifying the watchers + failPodFromExecutor(launchedTask) + + // 4. with pod still on the apiserver, bound i.e. host!="", notified via ListWatch + pod, launchedTask = startPod(offers1) + pod.Spec.NodeName = *offers1[0].Hostname + podListWatch.Modify(pod, true) // notifying the watchers + time.Sleep(time.Second / 2) + failPodFromExecutor(launchedTask) +} + +func TestDeleteOne_NonexistentPod(t *testing.T) { + assert := assert.New(t) + obj := &MockScheduler{} + reg := podtask.NewInMemoryRegistry() + obj.On("tasks").Return(reg) + + qr := newQueuer(nil) + assert.Equal(0, len(qr.podQueue.List())) + d := &deleter{ + api: obj, + qr: qr, + } + pod := &Pod{Pod: &api.Pod{ + ObjectMeta: api.ObjectMeta{ + Name: "foo", + Namespace: api.NamespaceDefault, + }}} + err := d.deleteOne(pod) + assert.Equal(err, noSuchPodErr) + obj.AssertExpectations(t) +} + +func TestDeleteOne_PendingPod(t *testing.T) { + assert := assert.New(t) + obj := &MockScheduler{} + reg := podtask.NewInMemoryRegistry() + obj.On("tasks").Return(reg) + + pod := &Pod{Pod: &api.Pod{ + ObjectMeta: api.ObjectMeta{ + Name: "foo", + UID: "foo0", + Namespace: api.NamespaceDefault, + }}} + _, err := reg.Register(podtask.New(api.NewDefaultContext(), "bar", *pod.Pod, &mesos.ExecutorInfo{})) + if err != nil { + t.Fatalf("failed to create task: %v", err) + } + + // preconditions + qr := newQueuer(nil) + qr.podQueue.Add(pod, queue.ReplaceExisting) + assert.Equal(1, len(qr.podQueue.List())) + _, found := qr.podQueue.Get("default/foo") + assert.True(found) + + // exec & post conditions + d := &deleter{ + api: obj, + qr: qr, + } + err = d.deleteOne(pod) + assert.Nil(err) + _, found = qr.podQueue.Get("foo0") + assert.False(found) + assert.Equal(0, len(qr.podQueue.List())) + obj.AssertExpectations(t) +} + +func TestDeleteOne_Running(t *testing.T) { + assert := assert.New(t) + obj := &MockScheduler{} + reg := podtask.NewInMemoryRegistry() + obj.On("tasks").Return(reg) + + pod := &Pod{Pod: &api.Pod{ + ObjectMeta: api.ObjectMeta{ + Name: "foo", + UID: "foo0", + Namespace: api.NamespaceDefault, + }}} + task, err := reg.Register(podtask.New(api.NewDefaultContext(), "bar", *pod.Pod, &mesos.ExecutorInfo{})) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + task.Set(podtask.Launched) + err = reg.Update(task) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } 
+ + // preconditions + qr := newQueuer(nil) + qr.podQueue.Add(pod, queue.ReplaceExisting) + assert.Equal(1, len(qr.podQueue.List())) + _, found := qr.podQueue.Get("default/foo") + assert.True(found) + + obj.On("killTask", task.ID).Return(nil) + + // exec & post conditions + d := &deleter{ + api: obj, + qr: qr, + } + err = d.deleteOne(pod) + assert.Nil(err) + _, found = qr.podQueue.Get("foo0") + assert.False(found) + assert.Equal(0, len(qr.podQueue.List())) + obj.AssertExpectations(t) +} + +func TestDeleteOne_badPodNaming(t *testing.T) { + assert := assert.New(t) + obj := &MockScheduler{} + pod := &Pod{Pod: &api.Pod{}} + d := &deleter{ + api: obj, + qr: newQueuer(nil), + } + + err := d.deleteOne(pod) + assert.NotNil(err) + + pod.Pod.ObjectMeta.Name = "foo" + err = d.deleteOne(pod) + assert.NotNil(err) + + pod.Pod.ObjectMeta.Name = "" + pod.Pod.ObjectMeta.Namespace = "bar" + err = d.deleteOne(pod) + assert.NotNil(err) + + obj.AssertExpectations(t) +} diff --git a/contrib/mesos/pkg/scheduler/pod.go b/contrib/mesos/pkg/scheduler/pod.go new file mode 100644 index 00000000000..4a9a9388b8f --- /dev/null +++ b/contrib/mesos/pkg/scheduler/pod.go @@ -0,0 +1,80 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheduler + +import ( + "fmt" + "time" + + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/queue" + "github.com/GoogleCloudPlatform/kubernetes/pkg/api" + "github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache" +) + +// wrapper for the k8s pod type so that we can define additional methods on a "pod" +type Pod struct { + *api.Pod + deadline *time.Time + delay *time.Duration + notify queue.BreakChan +} + +// implements Copyable +func (p *Pod) Copy() queue.Copyable { + if p == nil { + return nil + } + //TODO(jdef) we may need a better "deep-copy" implementation + pod := *(p.Pod) + return &Pod{Pod: &pod} +} + +// implements Unique +func (p *Pod) GetUID() string { + if id, err := cache.MetaNamespaceKeyFunc(p.Pod); err != nil { + panic(fmt.Sprintf("failed to determine pod id for '%+v'", p.Pod)) + } else { + return id + } +} + +// implements Deadlined +func (dp *Pod) Deadline() (time.Time, bool) { + if dp.deadline != nil { + return *(dp.deadline), true + } + return time.Time{}, false +} + +func (dp *Pod) GetDelay() time.Duration { + if dp.delay != nil { + return *(dp.delay) + } + return 0 +} + +func (p *Pod) Breaker() queue.BreakChan { + return p.notify +} + +func (p *Pod) String() string { + displayDeadline := "" + if deadline, ok := p.Deadline(); ok { + displayDeadline = deadline.String() + } + return fmt.Sprintf("{pod:%v, deadline:%v, delay:%v}", p.Pod.Name, displayDeadline, p.GetDelay()) +} diff --git a/contrib/mesos/pkg/scheduler/podtask/debug.go b/contrib/mesos/pkg/scheduler/podtask/debug.go new file mode 100644 index 00000000000..72d1a6b788d --- /dev/null +++ b/contrib/mesos/pkg/scheduler/podtask/debug.go @@ -0,0 +1,54 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package podtask + +import ( + "fmt" + "io" + "net/http" + + log "github.com/golang/glog" +) + +//TODO(jdef) we use a Locker to guard against concurrent task state changes, but it would be +//really, really nice to avoid doing this. Maybe someday the registry won't return data ptrs +//but plain structs instead. +func InstallDebugHandlers(reg Registry, mux *http.ServeMux) { + mux.HandleFunc("/debug/registry/tasks", func(w http.ResponseWriter, r *http.Request) { + //TODO(jdef) support filtering tasks based on status + alltasks := reg.List(nil) + io.WriteString(w, fmt.Sprintf("task_count=%d\n", len(alltasks))) + for _, task := range alltasks { + if err := func() (err error) { + podName := task.Pod.Name + podNamespace := task.Pod.Namespace + offerId := "" + if task.Offer != nil { + offerId = task.Offer.Id() + } + _, err = io.WriteString(w, fmt.Sprintf("%v\t%v/%v\t%v\t%v\n", task.ID, podNamespace, podName, task.State, offerId)) + return + }(); err != nil { + log.Warningf("aborting debug handler: %v", err) + break // stop listing on I/O errors + } + } + if flusher, ok := w.(http.Flusher); ok { + flusher.Flush() + } + }) +} diff --git a/contrib/mesos/pkg/scheduler/podtask/doc.go b/contrib/mesos/pkg/scheduler/podtask/doc.go new file mode 100644 index 00000000000..7c36ae5116b --- /dev/null +++ b/contrib/mesos/pkg/scheduler/podtask/doc.go @@ -0,0 +1,18 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package podtask maps Kubernetes pods to Mesos tasks. +package podtask diff --git a/contrib/mesos/pkg/scheduler/podtask/leaky.go b/contrib/mesos/pkg/scheduler/podtask/leaky.go new file mode 100644 index 00000000000..a0a66d7edc5 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/podtask/leaky.go @@ -0,0 +1,29 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package podtask + +// Concepts that have leaked to where they should not have. 
+
+import (
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/registry/etcd"
+)
+
+// MakePodKey constructs etcd paths to pod items, enforcing namespace rules.
+func MakePodKey(ctx api.Context, id string) (string, error) {
+	return etcd.MakeEtcdItemKey(ctx, PodPath, id)
+}
diff --git a/contrib/mesos/pkg/scheduler/podtask/pod_task.go b/contrib/mesos/pkg/scheduler/podtask/pod_task.go
new file mode 100644
index 00000000000..79f5edaf719
--- /dev/null
+++ b/contrib/mesos/pkg/scheduler/podtask/pod_task.go
@@ -0,0 +1,373 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package podtask
+
+import (
+	"fmt"
+	"strings"
+	"time"
+
+	"code.google.com/p/go-uuid/uuid"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
+	annotation "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/metrics"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
+	"github.com/gogo/protobuf/proto"
+	log "github.com/golang/glog"
+	mesos "github.com/mesos/mesos-go/mesosproto"
+	mutil "github.com/mesos/mesos-go/mesosutil"
+)
+
+const (
+	containerCpus = 0.25 // initial CPU allocated for executor
+	containerMem  = 64   // initial MB of memory allocated for executor
+)
+
+type StateType int
+
+const (
+	StatePending StateType = iota
+	StateRunning
+	StateFinished
+	StateUnknown
+)
+
+type FlagType string
+
+const (
+	Launched = FlagType("launched")
+	Bound    = FlagType("bound")
+	Deleted  = FlagType("deleted")
+)
+
+// T describes a pod task.
+type T struct {
+	ID          string
+	Pod         api.Pod
+	Spec        Spec
+	Offer       offers.Perishable // thread-safe
+	State       StateType
+	Flags       map[FlagType]struct{}
+	CreateTime  time.Time
+	UpdatedTime time.Time // time of the most recent StatusUpdate we've seen from the mesos master
+
+	podStatus  api.PodStatus
+	executor   *mesos.ExecutorInfo // readonly
+	podKey     string
+	launchTime time.Time
+	bindTime   time.Time
+	mapper     HostPortMappingType
+}
+
+type Spec struct {
+	SlaveID string
+	CPU     float64
+	Memory  float64
+	PortMap []HostPortMapping
+	Ports   []uint64
+	Data    []byte
+}
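T carries both a coarse State and a set of orthogonal Flags, modeled as a map[FlagType]struct{} used as a set. A quick sketch of the flag API defined later in this file (construction normally goes through New(), which initializes Flags):

    t := &T{Flags: make(map[FlagType]struct{})} // normally built via New()
    t.Set(Launched)                             // also stamps launchTime and records queue-wait metrics
    _ = t.Has(Launched)                         // true
    _ = t.Has(Deleted)                          // false

+
+// Clone mostly-clones this pod task. The clone will actually share some fields:
+//   - executor // OK because it's read only
+//   - Offer    // OK because it guarantees safe concurrent access
+func (t *T) Clone() *T {
+	if t == nil {
+		return nil
+	}
+
+	// shallow-copy
+	clone := *t
+
+	// deep copy
+	(&t.Spec).copyTo(&clone.Spec)
+	clone.Flags = map[FlagType]struct{}{}
+	for k := range t.Flags {
+		clone.Flags[k] = struct{}{}
+	}
+	return &clone
+}
+
+func (old *Spec) copyTo(new *Spec) {
+	if len(old.PortMap) > 0 {
+		new.PortMap = append(([]HostPortMapping)(nil), old.PortMap...)
+	}
+	if len(old.Ports) > 0 {
+		new.Ports = append(([]uint64)(nil), old.Ports...)
+	}
+	if len(old.Data) > 0 {
+		new.Data = append(([]byte)(nil), old.Data...)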
+ } +} + +func (t *T) HasAcceptedOffer() bool { + return t.Spec.SlaveID != "" +} + +func (t *T) GetOfferId() string { + if t.Offer == nil { + return "" + } + return t.Offer.Details().Id.GetValue() +} + +func generateTaskName(pod *api.Pod) string { + ns := pod.Namespace + if ns == "" { + ns = api.NamespaceDefault + } + return fmt.Sprintf("%s.%s.pods", pod.Name, ns) +} + +func (t *T) BuildTaskInfo() *mesos.TaskInfo { + info := &mesos.TaskInfo{ + Name: proto.String(generateTaskName(&t.Pod)), + TaskId: mutil.NewTaskID(t.ID), + SlaveId: mutil.NewSlaveID(t.Spec.SlaveID), + Executor: t.executor, + Data: t.Spec.Data, + Resources: []*mesos.Resource{ + mutil.NewScalarResource("cpus", t.Spec.CPU), + mutil.NewScalarResource("mem", t.Spec.Memory), + }, + } + if portsResource := rangeResource("ports", t.Spec.Ports); portsResource != nil { + info.Resources = append(info.Resources, portsResource) + } + return info +} + +// Fill the Spec in the T, should be called during k8s scheduling, +// before binding. +func (t *T) FillFromDetails(details *mesos.Offer) error { + if details == nil { + //programming error + panic("offer details are nil") + } + + log.V(3).Infof("Recording offer(s) %v against pod %v", details.Id, t.Pod.Name) + + t.Spec = Spec{ + SlaveID: details.GetSlaveId().GetValue(), + CPU: containerCpus, + Memory: containerMem, + } + + if mapping, err := t.mapper.Generate(t, details); err != nil { + t.Reset() + return err + } else { + ports := []uint64{} + for _, entry := range mapping { + ports = append(ports, entry.OfferPort) + } + t.Spec.PortMap = mapping + t.Spec.Ports = ports + } + + // hostname needs of the executor needs to match that of the offer, otherwise + // the kubelet node status checker/updater is very unhappy + const HOSTNAME_OVERRIDE_FLAG = "--hostname-override=" + hostname := details.GetHostname() // required field, non-empty + hostnameOverride := HOSTNAME_OVERRIDE_FLAG + hostname + + argv := t.executor.Command.Arguments + overwrite := false + for i, arg := range argv { + if strings.HasPrefix(arg, HOSTNAME_OVERRIDE_FLAG) { + overwrite = true + argv[i] = hostnameOverride + break + } + } + if !overwrite { + t.executor.Command.Arguments = append(argv, hostnameOverride) + } + return nil +} + +// Clear offer-related details from the task, should be called if/when an offer +// has already been assigned to a task but for some reason is no longer valid. +func (t *T) Reset() { + log.V(3).Infof("Clearing offer(s) from pod %v", t.Pod.Name) + t.Offer = nil + t.Spec = Spec{} +} + +func (t *T) AcceptOffer(offer *mesos.Offer) bool { + if offer == nil { + return false + } + var ( + cpus float64 = 0 + mem float64 = 0 + ) + for _, resource := range offer.Resources { + if resource.GetName() == "cpus" { + cpus = *resource.GetScalar().Value + } + + if resource.GetName() == "mem" { + mem = *resource.GetScalar().Value + } + } + if _, err := t.mapper.Generate(t, offer); err != nil { + log.V(3).Info(err) + return false + } + + // for now hard-coded, constant values are used for cpus and mem. This is necessary + // until parent-cgroup integration is finished for mesos and k8sm. Then the k8sm + // executor can become the parent of pods and subsume their resource usage and + // therefore be compliant with expectations of mesos executors w/ respect to + // resource allocation and management. 
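+	// For example (illustrative numbers): an offer advertising cpus:0.2 and
+	// mem:128 fails the check below because 0.2 < containerCpus (0.25), even
+	// though the memory is ample; both minimums must be satisfied.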
+ // + // TODO(jdef): remove hardcoded values and make use of actual pod resource settings + if (cpus < containerCpus) || (mem < containerMem) { + log.V(3).Infof("not enough resources: cpus: %f mem: %f", cpus, mem) + return false + } + return true +} + +func (t *T) Set(f FlagType) { + t.Flags[f] = struct{}{} + if Launched == f { + t.launchTime = time.Now() + queueWaitTime := t.launchTime.Sub(t.CreateTime) + metrics.QueueWaitTime.Observe(metrics.InMicroseconds(queueWaitTime)) + } +} + +func (t *T) Has(f FlagType) (exists bool) { + _, exists = t.Flags[f] + return +} + +func New(ctx api.Context, id string, pod api.Pod, executor *mesos.ExecutorInfo) (*T, error) { + if executor == nil { + return nil, fmt.Errorf("illegal argument: executor was nil") + } + key, err := MakePodKey(ctx, pod.Name) + if err != nil { + return nil, err + } + if id == "" { + id = "pod." + uuid.NewUUID().String() + } + task := &T{ + ID: id, + Pod: pod, + State: StatePending, + podKey: key, + mapper: MappingTypeForPod(&pod), + Flags: make(map[FlagType]struct{}), + executor: proto.Clone(executor).(*mesos.ExecutorInfo), + } + task.CreateTime = time.Now() + return task, nil +} + +func (t *T) SaveRecoveryInfo(dict map[string]string) { + dict[annotation.TaskIdKey] = t.ID + dict[annotation.SlaveIdKey] = t.Spec.SlaveID + dict[annotation.OfferIdKey] = t.Offer.Details().Id.GetValue() + dict[annotation.ExecutorIdKey] = t.executor.ExecutorId.GetValue() +} + +// reconstruct a task from metadata stashed in a pod entry. there are limited pod states that +// support reconstruction. if we expect to be able to reconstruct state but encounter errors +// in the process then those errors are returned. if the pod is in a seemingly valid state but +// otherwise does not support task reconstruction return false. if we're able to reconstruct +// state then return a reconstructed task and true. +// +// at this time task reconstruction is only supported for pods that have been annotated with +// binding metadata, which implies that they've previously been associated with a task and +// that mesos knows about it. +// +// assumes that the pod data comes from the k8s registry and reflects the desired state. +// +func RecoverFrom(pod api.Pod) (*T, bool, error) { + // we only expect annotations if pod has been bound, which implies that it has already + // been scheduled and launched + if pod.Spec.NodeName == "" && len(pod.Annotations) == 0 { + log.V(1).Infof("skipping recovery for unbound pod %v/%v", pod.Namespace, pod.Name) + return nil, false, nil + } + + // only process pods that are not in a terminal state + switch pod.Status.Phase { + case api.PodPending, api.PodRunning, api.PodUnknown: // continue + default: + log.V(1).Infof("skipping recovery for terminal pod %v/%v", pod.Namespace, pod.Name) + return nil, false, nil + } + + ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace) + key, err := MakePodKey(ctx, pod.Name) + if err != nil { + return nil, false, err + } + + //TODO(jdef) recover ports (and other resource requirements?) from the pod spec as well + + now := time.Now() + t := &T{ + Pod: pod, + CreateTime: now, + podKey: key, + State: StatePending, // possibly running? 
mesos will tell us during reconciliation + Flags: make(map[FlagType]struct{}), + mapper: MappingTypeForPod(&pod), + launchTime: now, + bindTime: now, + } + var ( + offerId string + hostname string + ) + for _, k := range []string{ + annotation.BindingHostKey, + annotation.TaskIdKey, + annotation.SlaveIdKey, + annotation.OfferIdKey, + annotation.ExecutorIdKey, + } { + v, found := pod.Annotations[k] + if !found { + return nil, false, fmt.Errorf("incomplete metadata: missing value for pod annotation: %v", k) + } + switch k { + case annotation.BindingHostKey: + hostname = v + case annotation.SlaveIdKey: + t.Spec.SlaveID = v + case annotation.OfferIdKey: + offerId = v + case annotation.TaskIdKey: + t.ID = v + case annotation.ExecutorIdKey: + // this is nowhere near sufficient to re-launch a task, but we really just + // want this for tracking + t.executor = &mesos.ExecutorInfo{ExecutorId: mutil.NewExecutorID(v)} + } + } + t.Offer = offers.Expired(offerId, hostname, 0) + t.Flags[Launched] = struct{}{} + t.Flags[Bound] = struct{}{} + return t, true, nil +} diff --git a/contrib/mesos/pkg/scheduler/podtask/pod_task_test.go b/contrib/mesos/pkg/scheduler/podtask/pod_task_test.go new file mode 100644 index 00000000000..02506c5df9a --- /dev/null +++ b/contrib/mesos/pkg/scheduler/podtask/pod_task_test.go @@ -0,0 +1,153 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package podtask
+
+import (
+	"testing"
+
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
+	mesos "github.com/mesos/mesos-go/mesosproto"
+	mutil "github.com/mesos/mesos-go/mesosutil"
+)
+
+const (
+	t_min_cpu = 128
+	t_min_mem = 128
+)
+
+func fakePodTask(id string) (*T, error) {
+	return New(api.NewDefaultContext(), "", api.Pod{
+		ObjectMeta: api.ObjectMeta{
+			Name:      id,
+			Namespace: api.NamespaceDefault,
+		},
+	}, &mesos.ExecutorInfo{})
+}
+
+func TestEmptyOffer(t *testing.T) {
+	t.Parallel()
+	task, err := fakePodTask("foo")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if ok := task.AcceptOffer(nil); ok {
+		t.Fatalf("accepted nil offer")
+	}
+	if ok := task.AcceptOffer(&mesos.Offer{}); ok {
+		t.Fatalf("accepted empty offer")
+	}
+}
+
+func TestNoPortsInPodOrOffer(t *testing.T) {
+	t.Parallel()
+	task, err := fakePodTask("foo")
+	if err != nil || task == nil {
+		t.Fatal(err)
+	}
+
+	offer := &mesos.Offer{
+		Resources: []*mesos.Resource{
+			mutil.NewScalarResource("cpus", 0.001),
+			mutil.NewScalarResource("mem", 0.001),
+		},
+	}
+	if ok := task.AcceptOffer(offer); ok {
+		t.Fatalf("accepted offer %v", offer)
+	}
+
+	offer = &mesos.Offer{
+		Resources: []*mesos.Resource{
+			mutil.NewScalarResource("cpus", t_min_cpu),
+			mutil.NewScalarResource("mem", t_min_mem),
+		},
+	}
+	if ok := task.AcceptOffer(offer); !ok {
+		t.Fatalf("did not accept offer %v", offer)
+	}
+}
+
+func TestAcceptOfferPorts(t *testing.T) {
+	t.Parallel()
+	task, _ := fakePodTask("foo")
+	pod := &task.Pod
+
+	offer := &mesos.Offer{
+		Resources: []*mesos.Resource{
+			mutil.NewScalarResource("cpus", t_min_cpu),
+			mutil.NewScalarResource("mem", t_min_mem),
+			rangeResource("ports", []uint64{1, 1}),
+		},
+	}
+	if ok := task.AcceptOffer(offer); !ok {
+		t.Fatalf("did not accept offer %v", offer)
+	}
+
+	pod.Spec = api.PodSpec{
+		Containers: []api.Container{{
+			Ports: []api.ContainerPort{{
+				HostPort: 123,
+			}},
+		}},
+	}
+	if ok := task.AcceptOffer(offer); ok {
+		t.Fatalf("accepted offer %v", offer)
+	}
+
+	pod.Spec.Containers[0].Ports[0].HostPort = 1
+	if ok := task.AcceptOffer(offer); !ok {
+		t.Fatalf("did not accept offer %v", offer)
+	}
+
+	pod.Spec.Containers[0].Ports[0].HostPort = 0
+	if ok := task.AcceptOffer(offer); !ok {
+		t.Fatalf("did not accept offer %v", offer)
+	}
+
+	offer.Resources = []*mesos.Resource{
+		mutil.NewScalarResource("cpus", t_min_cpu),
+		mutil.NewScalarResource("mem", t_min_mem),
+	}
+	if ok := task.AcceptOffer(offer); ok {
+		t.Fatalf("accepted offer %v", offer)
+	}
+
+	pod.Spec.Containers[0].Ports[0].HostPort = 1
+	if ok := task.AcceptOffer(offer); ok {
+		t.Fatalf("accepted offer %v", offer)
+	}
+}
+
+func TestGeneratePodName(t *testing.T) {
+	p := &api.Pod{
+		ObjectMeta: api.ObjectMeta{
+			Name:      "foo",
+			Namespace: "bar",
+		},
+	}
+	name := generateTaskName(p)
+	expected := "foo.bar.pods"
+	if name != expected {
+		t.Fatalf("expected %q instead of %q", expected, name)
+	}
+
+	p.Namespace = ""
+	name = generateTaskName(p)
+	expected = "foo.default.pods"
+	if name != expected {
+		t.Fatalf("expected %q instead of %q", expected, name)
+	}
+}
diff --git a/contrib/mesos/pkg/scheduler/podtask/port_mapping.go b/contrib/mesos/pkg/scheduler/podtask/port_mapping.go
new file mode 100644
index 00000000000..9c90ef15b70
--- /dev/null
+++ b/contrib/mesos/pkg/scheduler/podtask/port_mapping.go
@@ -0,0 +1,185 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package podtask
+
+import (
+	"fmt"
+
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
+	log "github.com/golang/glog"
+	mesos "github.com/mesos/mesos-go/mesosproto"
+)
+
+type HostPortMappingType string
+
+const (
+	// maps a Container.HostPort to the same exact offered host port, ignores .HostPort = 0
+	HostPortMappingFixed HostPortMappingType = "fixed"
+	// same as HostPortMappingFixed, except that a .HostPort of 0 is mapped to any offered port
+	HostPortMappingWildcard HostPortMappingType = "wildcard"
+)
+
+type HostPortMapper interface {
+	// abstracts the way that host ports are mapped to pod container ports
+	Generate(t *T, offer *mesos.Offer) ([]HostPortMapping, error)
+}
+
+type HostPortMapping struct {
+	ContainerIdx int // index of the container in the pod spec
+	PortIdx      int // index of the port in a container's port spec
+	OfferPort    uint64
+}
+
+func (self HostPortMappingType) Generate(t *T, offer *mesos.Offer) ([]HostPortMapping, error) {
+	switch self {
+	case HostPortMappingWildcard:
+		return wildcardHostPortMapping(t, offer)
+	case HostPortMappingFixed:
+	default:
+		log.Warningf("illegal host-port mapping spec %q, defaulting to %q", self, HostPortMappingFixed)
+	}
+	return defaultHostPortMapping(t, offer)
+}
+
+type PortAllocationError struct {
+	PodId string
+	Ports []uint64
+}
+
+func (err *PortAllocationError) Error() string {
+	return fmt.Sprintf("Could not schedule pod %s: %d port(s) could not be allocated", err.PodId, len(err.Ports))
+}
+
+type DuplicateHostPortError struct {
+	m1, m2 HostPortMapping
+}
+
+func (err *DuplicateHostPortError) Error() string {
+	return fmt.Sprintf(
+		"Host port %d is specified for container %d, port %d and container %d, port %d",
+		err.m1.OfferPort, err.m1.ContainerIdx, err.m1.PortIdx, err.m2.ContainerIdx, err.m2.PortIdx)
+}
+
+// wildcard k8s host port mapping implementation: hostPort == 0 gets mapped to any available offer port
+func wildcardHostPortMapping(t *T, offer *mesos.Offer) ([]HostPortMapping, error) {
+	mapping, err := defaultHostPortMapping(t, offer)
+	if err != nil {
+		return nil, err
+	}
+	taken := make(map[uint64]struct{})
+	for _, entry := range mapping {
+		taken[entry.OfferPort] = struct{}{}
+	}
+	wildports := []HostPortMapping{}
+	for i, container := range t.Pod.Spec.Containers {
+		for pi, port := range container.Ports {
+			if port.HostPort == 0 {
+				wildports = append(wildports, HostPortMapping{
+					ContainerIdx: i,
+					PortIdx:      pi,
+				})
+			}
+		}
+	}
+	remaining := len(wildports)
+	foreachRange(offer, "ports", func(bp, ep uint64) {
+		log.V(3).Infof("Searching for wildcard port in range {%d:%d}", bp, ep)
+		for i := range wildports {
+			// take the entry by pointer so that a port assigned in an earlier
+			// range is remembered when processing subsequent ranges
+			entry := &wildports[i]
+			if entry.OfferPort != 0 {
+				continue
+			}
+			for port := bp; port <= ep && remaining > 0; port++ {
+				if _, inuse := taken[port]; inuse {
+					continue
+				}
+				entry.OfferPort = port
+				mapping = append(mapping, *entry)
+				remaining--
+				taken[port] = struct{}{}
+				break
+			}
+		}
+	})
+	if remaining > 0 {
+		err := &PortAllocationError{
+			PodId: t.Pod.Name,
+		}
+		// it doesn't make sense to include a port list here because they were all zero (wildcards)
+		return nil, err
+	}
+	return mapping, nil
+}
+
+// default k8s host port mapping implementation: hostPort == 0 means containerPort remains pod-private, and so
+// no offer ports will be mapped to such Container ports.
+func defaultHostPortMapping(t *T, offer *mesos.Offer) ([]HostPortMapping, error) {
+	requiredPorts := make(map[uint64]HostPortMapping)
+	mapping := []HostPortMapping{}
+	for i, container := range t.Pod.Spec.Containers {
+		// strip all port==0 from this array; k8s already knows what to do with zero-
+		// ports (it does not create 'port bindings' on the minion-host); we need to
+		// remove the wildcards from this array since they don't consume host resources
+		for pi, port := range container.Ports {
+			if port.HostPort == 0 {
+				continue // ignore
+			}
+			m := HostPortMapping{
+				ContainerIdx: i,
+				PortIdx:      pi,
+				OfferPort:    uint64(port.HostPort),
+			}
+			if entry, inuse := requiredPorts[uint64(port.HostPort)]; inuse {
+				return nil, &DuplicateHostPortError{entry, m}
+			}
+			requiredPorts[uint64(port.HostPort)] = m
+		}
+	}
+	foreachRange(offer, "ports", func(bp, ep uint64) {
+		for port := range requiredPorts {
+			log.V(3).Infof("evaluating port range {%d:%d} %d", bp, ep, port)
+			if (bp <= port) && (port <= ep) {
+				mapping = append(mapping, requiredPorts[port])
+				delete(requiredPorts, port)
+			}
+		}
+	})
+	unsatisfiedPorts := len(requiredPorts)
+	if unsatisfiedPorts > 0 {
+		err := &PortAllocationError{
+			PodId: t.Pod.Name,
+		}
+		for p := range requiredPorts {
+			err.Ports = append(err.Ports, p)
+		}
+		return nil, err
+	}
+	return mapping, nil
+}
+
+const PortMappingLabelKey = "k8s.mesosphere.io/portMapping"
+
+func MappingTypeForPod(pod *api.Pod) HostPortMappingType {
+	filter := map[string]string{
+		PortMappingLabelKey: string(HostPortMappingFixed),
+	}
+	selector := labels.Set(filter).AsSelector()
+	if selector.Matches(labels.Set(pod.Labels)) {
+		return HostPortMappingFixed
+	}
+	return HostPortMappingWildcard
+}
diff --git a/contrib/mesos/pkg/scheduler/podtask/port_mapping_test.go b/contrib/mesos/pkg/scheduler/podtask/port_mapping_test.go
new file mode 100644
index 00000000000..d5c911f3bf2
--- /dev/null
+++ b/contrib/mesos/pkg/scheduler/podtask/port_mapping_test.go
@@ -0,0 +1,205 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package podtask + +import ( + "testing" + + "github.com/GoogleCloudPlatform/kubernetes/pkg/api" + mesos "github.com/mesos/mesos-go/mesosproto" +) + +func TestDefaultHostPortMatching(t *testing.T) { + t.Parallel() + task, _ := fakePodTask("foo") + pod := &task.Pod + + offer := &mesos.Offer{ + Resources: []*mesos.Resource{ + rangeResource("ports", []uint64{1, 1}), + }, + } + mapping, err := defaultHostPortMapping(task, offer) + if err != nil { + t.Fatal(err) + } + if len(mapping) > 0 { + t.Fatalf("Found mappings for a pod without ports: %v", pod) + } + + //-- + pod.Spec = api.PodSpec{ + Containers: []api.Container{{ + Ports: []api.ContainerPort{{ + HostPort: 123, + }, { + HostPort: 123, + }}, + }}, + } + task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{}) + if err != nil { + t.Fatal(err) + } + _, err = defaultHostPortMapping(task, offer) + if err, _ := err.(*DuplicateHostPortError); err == nil { + t.Fatal("Expected duplicate port error") + } else if err.m1.OfferPort != 123 { + t.Fatal("Expected duplicate host port 123") + } +} + +func TestWildcardHostPortMatching(t *testing.T) { + t.Parallel() + task, _ := fakePodTask("foo") + pod := &task.Pod + + offer := &mesos.Offer{} + mapping, err := wildcardHostPortMapping(task, offer) + if err != nil { + t.Fatal(err) + } + if len(mapping) > 0 { + t.Fatalf("Found mappings for an empty offer and a pod without ports: %v", pod) + } + + //-- + offer = &mesos.Offer{ + Resources: []*mesos.Resource{ + rangeResource("ports", []uint64{1, 1}), + }, + } + mapping, err = wildcardHostPortMapping(task, offer) + if err != nil { + t.Fatal(err) + } + if len(mapping) > 0 { + t.Fatalf("Found mappings for a pod without ports: %v", pod) + } + + //-- + pod.Spec = api.PodSpec{ + Containers: []api.Container{{ + Ports: []api.ContainerPort{{ + HostPort: 123, + }}, + }}, + } + task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{}) + if err != nil { + t.Fatal(err) + } + mapping, err = wildcardHostPortMapping(task, offer) + if err == nil { + t.Fatalf("expected error instead of mappings: %#v", mapping) + } else if err, _ := err.(*PortAllocationError); err == nil { + t.Fatal("Expected port allocation error") + } else if !(len(err.Ports) == 1 && err.Ports[0] == 123) { + t.Fatal("Expected port allocation error for host port 123") + } + + //-- + pod.Spec = api.PodSpec{ + Containers: []api.Container{{ + Ports: []api.ContainerPort{{ + HostPort: 0, + }, { + HostPort: 123, + }}, + }}, + } + task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{}) + if err != nil { + t.Fatal(err) + } + mapping, err = wildcardHostPortMapping(task, offer) + if err, _ := err.(*PortAllocationError); err == nil { + t.Fatal("Expected port allocation error") + } else if !(len(err.Ports) == 1 && err.Ports[0] == 123) { + t.Fatal("Expected port allocation error for host port 123") + } + + //-- + pod.Spec = api.PodSpec{ + Containers: []api.Container{{ + Ports: []api.ContainerPort{{ + HostPort: 0, + }, { + HostPort: 1, + }}, + }}, + } + task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{}) + if err != nil { + t.Fatal(err) + } + mapping, err = wildcardHostPortMapping(task, offer) + if err, _ := err.(*PortAllocationError); err == nil { + t.Fatal("Expected port allocation error") + } else if len(err.Ports) != 0 { + t.Fatal("Expected port allocation error for wildcard port") + } + + //-- + offer = &mesos.Offer{ + Resources: []*mesos.Resource{ + rangeResource("ports", []uint64{1, 2}), + }, + } + mapping, err = wildcardHostPortMapping(task, 
offer) + if err != nil { + t.Fatal(err) + } else if len(mapping) != 2 { + t.Fatal("Expected both ports allocated") + } + valid := 0 + for _, entry := range mapping { + if entry.ContainerIdx == 0 && entry.PortIdx == 0 && entry.OfferPort == 2 { + valid++ + } + if entry.ContainerIdx == 0 && entry.PortIdx == 1 && entry.OfferPort == 1 { + valid++ + } + } + if valid < 2 { + t.Fatalf("Expected 2 valid port mappings, not %d", valid) + } +} + +func TestMappingTypeForPod(t *testing.T) { + pod := &api.Pod{ + ObjectMeta: api.ObjectMeta{ + Labels: map[string]string{}, + }, + } + mt := MappingTypeForPod(pod) + if mt != HostPortMappingWildcard { + t.Fatalf("expected wildcard mapping") + } + + pod.Labels[PortMappingLabelKey] = string(HostPortMappingFixed) + mt = MappingTypeForPod(pod) + if mt != HostPortMappingFixed { + t.Fatalf("expected fixed mapping") + } + + pod.Labels[PortMappingLabelKey] = string(HostPortMappingWildcard) + mt = MappingTypeForPod(pod) + if mt != HostPortMappingWildcard { + t.Fatalf("expected wildcard mapping") + } +} diff --git a/contrib/mesos/pkg/scheduler/podtask/protobuf.go b/contrib/mesos/pkg/scheduler/podtask/protobuf.go new file mode 100644 index 00000000000..c8245425549 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/podtask/protobuf.go @@ -0,0 +1,57 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package podtask + +import ( + "github.com/gogo/protobuf/proto" + mesos "github.com/mesos/mesos-go/mesosproto" +) + +// create a range resource for the listed ports +func rangeResource(name string, ports []uint64) *mesos.Resource { + if len(ports) == 0 { + // pod may consist of a container that doesn't expose any ports on the host + return nil + } + return &mesos.Resource{ + Name: proto.String(name), + Type: mesos.Value_RANGES.Enum(), + Ranges: newRanges(ports), + } +} + +// generate port ranges from a list of ports. this implementation is very naive +func newRanges(ports []uint64) *mesos.Value_Ranges { + r := make([]*mesos.Value_Range, 0) + for _, port := range ports { + x := proto.Uint64(port) + r = append(r, &mesos.Value_Range{Begin: x, End: x}) + } + return &mesos.Value_Ranges{Range: r} +} + +func foreachRange(offer *mesos.Offer, resourceName string, f func(begin, end uint64)) { + for _, resource := range offer.Resources { + if resource.GetName() == resourceName { + for _, r := range (*resource).GetRanges().Range { + bp := r.GetBegin() + ep := r.GetEnd() + f(bp, ep) + } + } + } +} diff --git a/contrib/mesos/pkg/scheduler/podtask/registry.go b/contrib/mesos/pkg/scheduler/podtask/registry.go new file mode 100644 index 00000000000..589484fb143 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/podtask/registry.go @@ -0,0 +1,335 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package podtask
+
+import (
+	"container/ring"
+	"encoding/json"
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/metrics"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
+	log "github.com/golang/glog"
+	mesos "github.com/mesos/mesos-go/mesosproto"
+)
+
+const (
+	//TODO(jdef) move this somewhere else
+	PodPath = "/pods"
+
+	// length of historical record of finished tasks
+	defaultFinishedTasksSize = 1024
+)
+
+// state store for pod tasks
+type Registry interface {
+	// register the specified task with this registry, as long as the current error
+	// condition is nil. if no errors occur then return a copy of the registered task.
+	Register(*T, error) (*T, error)
+
+	// unregister the specified task from this registry
+	Unregister(*T)
+
+	// update state for the registered task identified by task.ID. an error is
+	// returned if the task is unknown or may not be updated in its current state.
+	Update(task *T) error
+
+	// return the task registered for the specified task ID and its current state.
+	// if there is no such task then StateUnknown is returned.
+	Get(taskId string) (task *T, currentState StateType)
+
+	// return the non-terminal task corresponding to the specified pod ID
+	ForPod(podID string) (task *T, currentState StateType)
+
+	// update the task status given the specified mesos task status update, returning a
+	// copy of the updated task (if any) and the state the task had prior to the update.
+	UpdateStatus(status *mesos.TaskStatus) (*T, StateType)
+
+	// return a list of task ID's that match the given filter, or all task ID's if filter == nil.
+	List(filter func(*T) bool) []*T
+}
+
+type inMemoryRegistry struct {
+	rw            sync.RWMutex
+	taskRegistry  map[string]*T
+	tasksFinished *ring.Ring
+	podToTask     map[string]string
+}
+
+func NewInMemoryRegistry() Registry {
+	return &inMemoryRegistry{
+		taskRegistry:  make(map[string]*T),
+		tasksFinished: ring.New(defaultFinishedTasksSize),
+		podToTask:     make(map[string]string),
+	}
+}
+
+func (k *inMemoryRegistry) List(accepts func(t *T) bool) (tasks []*T) {
+	k.rw.RLock()
+	defer k.rw.RUnlock()
+	for _, task := range k.taskRegistry {
+		if accepts == nil || accepts(task) {
+			tasks = append(tasks, task.Clone())
+		}
+	}
+	return
+}
+
+func (k *inMemoryRegistry) ForPod(podID string) (task *T, currentState StateType) {
+	k.rw.RLock()
+	defer k.rw.RUnlock()
+	tid, ok := k.podToTask[podID]
+	if !ok {
+		return nil, StateUnknown
+	}
+	t, state := k._get(tid)
+	return t.Clone(), state
+}
+
+// registers a pod task unless the spec'd error is non-nil
+func (k *inMemoryRegistry) Register(task *T, err error) (*T, error) {
+	if err == nil {
+		k.rw.Lock()
+		defer k.rw.Unlock()
+		if _, found := k.podToTask[task.podKey]; found {
+			return nil, fmt.Errorf("task already registered for pod key %q", task.podKey)
+		}
+		if _, found := k.taskRegistry[task.ID]; found {
+			return nil, fmt.Errorf("task already registered for id %q", task.ID)
+		}
+		k.podToTask[task.podKey] = task.ID
+		k.taskRegistry[task.ID] = task
+	}
+	return task.Clone(), err
+}
+
+// updates internal task state. updates are limited to Spec, Flags, and Offer for
+// StatePending tasks, and are limited to Flag updates (additive only) for StateRunning tasks.
+func (k *inMemoryRegistry) Update(task *T) error {
+	if task == nil {
+		return nil
+	}
+	k.rw.Lock()
+	defer k.rw.Unlock()
+	switch internal, state := k._get(task.ID); state {
+	case StateUnknown:
+		return fmt.Errorf("no such task: %v", task.ID)
+	case StatePending:
+		internal.Offer = task.Offer
+		internal.Spec = task.Spec
+		(&task.Spec).copyTo(&internal.Spec)
+		internal.Flags = map[FlagType]struct{}{}
+		fallthrough
+	case StateRunning:
+		for k, v := range task.Flags {
+			internal.Flags[k] = v
+		}
+		return nil
+	default:
+		return fmt.Errorf("may not update task %v in state %v", task.ID, state)
+	}
+}
+
+func (k *inMemoryRegistry) Unregister(task *T) {
+	k.rw.Lock()
+	defer k.rw.Unlock()
+	delete(k.podToTask, task.podKey)
+	delete(k.taskRegistry, task.ID)
+}
+
+func (k *inMemoryRegistry) Get(taskId string) (*T, StateType) {
+	k.rw.RLock()
+	defer k.rw.RUnlock()
+	t, state := k._get(taskId)
+	return t.Clone(), state
+}
+
+// assume that the caller has already locked around access to task state.
+// the caller is also responsible for cloning the task object before it leaves
+// the context of this registry.
+func (k *inMemoryRegistry) _get(taskId string) (*T, StateType) {
+	if task, found := k.taskRegistry[taskId]; found {
+		return task, task.State
+	}
+	return nil, StateUnknown
+}
+
+func (k *inMemoryRegistry) UpdateStatus(status *mesos.TaskStatus) (*T, StateType) {
+	taskId := status.GetTaskId().GetValue()
+
+	k.rw.Lock()
+	defer k.rw.Unlock()
+	task, state := k._get(taskId)
+
+	switch status.GetState() {
+	case mesos.TaskState_TASK_STAGING:
+		k.handleTaskStaging(task, state, status)
+	case mesos.TaskState_TASK_STARTING:
+		k.handleTaskStarting(task, state, status)
+	case mesos.TaskState_TASK_RUNNING:
+		k.handleTaskRunning(task, state, status)
+	case mesos.TaskState_TASK_FINISHED:
+		k.handleTaskFinished(task, state, status)
+	case mesos.TaskState_TASK_FAILED:
+		k.handleTaskFailed(task, state, status)
+	case mesos.TaskState_TASK_KILLED:
+		k.handleTaskKilled(task, state, status)
+	case mesos.TaskState_TASK_LOST:
+		k.handleTaskLost(task, state, status)
+	default:
+		log.Warningf("unhandled status update for task: %v", taskId)
+	}
+	return task.Clone(), state
+}
+
+func (k *inMemoryRegistry) handleTaskStaging(task *T, state StateType, status *mesos.TaskStatus) {
+	if status.GetSource() != mesos.TaskStatus_SOURCE_MASTER {
+		log.Errorf("received STAGING for task %v with unexpected source: %v",
+			status.GetTaskId().GetValue(), status.GetSource())
+	}
+}
+
+func (k *inMemoryRegistry) handleTaskStarting(task *T, state StateType, status *mesos.TaskStatus) {
+	// we expect to receive this when a launched task is finally "bound" via the
+	// API server: mark pending tasks as Bound and record their bind latency.
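+	// Illustrative numbers: a task launched at time t and bound at t+150ms
+	// contributes 150000µs to the BindLatency histogram below
+	// (metrics.InMicroseconds converts the bind-time delta).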
+	switch state {
+	case StatePending:
+		task.UpdatedTime = time.Now()
+		if !task.Has(Bound) {
+			task.Set(Bound)
+			task.bindTime = task.UpdatedTime
+			timeToBind := task.bindTime.Sub(task.launchTime)
+			metrics.BindLatency.Observe(metrics.InMicroseconds(timeToBind))
+		}
+	default:
+		taskId := status.GetTaskId().GetValue()
+		log.Warningf("Ignore status TASK_STARTING because the task %v is not pending", taskId)
+	}
+}
+
+func (k *inMemoryRegistry) handleTaskRunning(task *T, state StateType, status *mesos.TaskStatus) {
+	taskId := status.GetTaskId().GetValue()
+	switch state {
+	case StatePending:
+		task.UpdatedTime = time.Now()
+		log.Infof("Received running status for pending task: %v", taskId)
+		fillRunningPodInfo(task, status)
+		task.State = StateRunning
+	case StateRunning:
+		task.UpdatedTime = time.Now()
+		log.V(2).Infof("Ignore status TASK_RUNNING because the task %v is already running", taskId)
+	case StateFinished:
+		log.Warningf("Ignore status TASK_RUNNING because the task %v is already finished", taskId)
+	default:
+		log.Warningf("Ignore status TASK_RUNNING because the task %v is discarded", taskId)
+	}
+}
+
+func ParsePodStatusResult(taskStatus *mesos.TaskStatus) (result api.PodStatusResult, err error) {
+	if taskStatus.Data != nil {
+		err = json.Unmarshal(taskStatus.Data, &result)
+	} else {
+		err = fmt.Errorf("missing TaskStatus.Data")
+	}
+	return
+}
+
+func fillRunningPodInfo(task *T, taskStatus *mesos.TaskStatus) {
+	if taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION && taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER {
+		// there is no data
+		return
+	}
+	//TODO(jdef) determine the usefulness of this information (if any)
+	if result, err := ParsePodStatusResult(taskStatus); err != nil {
+		log.Errorf("invalid TaskStatus.Data for task '%v': %v", task.ID, err)
+	} else {
+		task.podStatus = result.Status
+		log.Infof("received pod status for task %v: %+v", task.ID, result.Status)
+	}
+}
+
+func (k *inMemoryRegistry) handleTaskFinished(task *T, state StateType, status *mesos.TaskStatus) {
+	taskId := status.GetTaskId().GetValue()
+	switch state {
+	case StatePending:
+		panic(fmt.Sprintf("Pending task %v finished, this shouldn't happen", taskId))
+	case StateRunning:
+		log.V(2).Infof("received finished status for running task: %v", taskId)
+		delete(k.podToTask, task.podKey)
+		task.State = StateFinished
+		task.UpdatedTime = time.Now()
+		k.tasksFinished = k.recordFinishedTask(task.ID)
+	case StateFinished:
+		log.Warningf("Ignore status TASK_FINISHED because the task %v is already finished", taskId)
+	default:
+		log.Warningf("Ignore status TASK_FINISHED because the task %v is not running", taskId)
+	}
+}
+
+// record that a task has finished.
+// older records are expunged one at a time once the historical ring buffer is saturated.
+// assumes caller is holding state lock.
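+//
+// A sketch of the ring behavior, assuming the default capacity of 1024: the
+// first 1024 finished task IDs fill empty slots; the 1025th overwrites the
+// oldest slot, whose previous task is evicted from taskRegistry if it is
+// still in StateFinished.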
+func (k *inMemoryRegistry) recordFinishedTask(taskId string) *ring.Ring { + slot := k.tasksFinished.Next() + if slot.Value != nil { + // garbage collect older finished task from the registry + gctaskId := slot.Value.(string) + if gctask, found := k.taskRegistry[gctaskId]; found && gctask.State == StateFinished { + delete(k.taskRegistry, gctaskId) + } + } + slot.Value = taskId + return slot +} + +func (k *inMemoryRegistry) handleTaskFailed(task *T, state StateType, status *mesos.TaskStatus) { + switch state { + case StatePending: + delete(k.taskRegistry, task.ID) + delete(k.podToTask, task.podKey) + case StateRunning: + delete(k.taskRegistry, task.ID) + delete(k.podToTask, task.podKey) + } +} + +func (k *inMemoryRegistry) handleTaskKilled(task *T, state StateType, status *mesos.TaskStatus) { + defer func() { + msg := fmt.Sprintf("task killed: %+v, task %+v", status, task) + if task != nil && task.Has(Deleted) { + // we were expecting this, nothing out of the ordinary + log.V(2).Infoln(msg) + } else { + log.Errorln(msg) + } + }() + switch state { + case StatePending, StateRunning: + delete(k.taskRegistry, task.ID) + delete(k.podToTask, task.podKey) + } +} + +func (k *inMemoryRegistry) handleTaskLost(task *T, state StateType, status *mesos.TaskStatus) { + switch state { + case StateRunning, StatePending: + delete(k.taskRegistry, task.ID) + delete(k.podToTask, task.podKey) + } +} diff --git a/contrib/mesos/pkg/scheduler/podtask/registry_test.go b/contrib/mesos/pkg/scheduler/podtask/registry_test.go new file mode 100644 index 00000000000..92716b2bb81 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/podtask/registry_test.go @@ -0,0 +1,320 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package podtask + +import ( + "testing" + "time" + + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc" + mesos "github.com/mesos/mesos-go/mesosproto" + "github.com/mesos/mesos-go/mesosutil" + "github.com/stretchr/testify/assert" +) + +func TestInMemoryRegistry_RegisterGetUnregister(t *testing.T) { + assert := assert.New(t) + + registry := NewInMemoryRegistry() + + // it's empty at the beginning + tasks := registry.List(func(t *T) bool { return true }) + assert.Empty(tasks) + + // add a task + a, _ := fakePodTask("a") + a_clone, err := registry.Register(a, nil) + assert.NoError(err) + assert.Equal(a_clone.ID, a.ID) + assert.Equal(a_clone.podKey, a.podKey) + + // add another task + b, _ := fakePodTask("b") + b_clone, err := registry.Register(b, nil) + assert.NoError(err) + assert.Equal(b_clone.ID, b.ID) + assert.Equal(b_clone.podKey, b.podKey) + + // find tasks in the registry + tasks = registry.List(func(t *T) bool { return true }) + assert.Len(tasks, 2) + assert.Contains(tasks, a_clone) + assert.Contains(tasks, b_clone) + + tasks = registry.List(func(t *T) bool { return t.ID == a.ID }) + assert.Len(tasks, 1) + assert.Contains(tasks, a_clone) + + task, _ := registry.ForPod(a.podKey) + assert.NotNil(task) + assert.Equal(task.ID, a.ID) + + task, _ = registry.ForPod(b.podKey) + assert.NotNil(task) + assert.Equal(task.ID, b.ID) + + task, _ = registry.ForPod("no-pod-key") + assert.Nil(task) + + task, _ = registry.Get(a.ID) + assert.NotNil(task) + assert.Equal(task.ID, a.ID) + + task, _ = registry.Get("unknown-task-id") + assert.Nil(task) + + // re-add a task + a_clone, err = registry.Register(a, nil) + assert.Error(err) + assert.Nil(a_clone) + + // re-add a task with another podKey, but same task id + another_a := a.Clone() + another_a.podKey = "another-pod" + another_a_clone, err := registry.Register(another_a, nil) + assert.Error(err) + assert.Nil(another_a_clone) + + // re-add a task with another task ID, but same podKey + another_b := b.Clone() + another_b.ID = "another-task-id" + another_b_clone, err := registry.Register(another_b, nil) + assert.Error(err) + assert.Nil(another_b_clone) + + // unregister a task + registry.Unregister(b) + + tasks = registry.List(func(t *T) bool { return true }) + assert.Len(tasks, 1) + assert.Contains(tasks, a) + + // unregister a task not registered + unregistered_task, _ := fakePodTask("unregistered-task") + registry.Unregister(unregistered_task) +} + +func fakeStatusUpdate(taskId string, state mesos.TaskState) *mesos.TaskStatus { + status := mesosutil.NewTaskStatus(mesosutil.NewTaskID(taskId), state) + status.Data = []byte("{}") // empty json + masterSource := mesos.TaskStatus_SOURCE_MASTER + status.Source = &masterSource + return status +} + +func TestInMemoryRegistry_State(t *testing.T) { + assert := assert.New(t) + + registry := NewInMemoryRegistry() + + // add a task + a, _ := fakePodTask("a") + a_clone, err := registry.Register(a, nil) + assert.NoError(err) + assert.Equal(a.State, a_clone.State) + + // update the status + assert.Equal(a_clone.State, StatePending) + a_clone, state := registry.UpdateStatus(fakeStatusUpdate(a.ID, mesos.TaskState_TASK_RUNNING)) + assert.Equal(state, StatePending) // old state + assert.Equal(a_clone.State, StateRunning) // new state + + // update unknown task + unknown_clone, state := registry.UpdateStatus(fakeStatusUpdate("unknown-task-id", mesos.TaskState_TASK_RUNNING)) + assert.Nil(unknown_clone) + assert.Equal(state, 
StateUnknown) +} + +func TestInMemoryRegistry_Update(t *testing.T) { + assert := assert.New(t) + + // create offers registry + ttl := time.Second / 4 + config := offers.RegistryConfig{ + DeclineOffer: func(offerId string) <-chan error { + return proc.ErrorChan(nil) + }, + Compat: func(o *mesos.Offer) bool { + return true + }, + TTL: ttl, + LingerTTL: 2 * ttl, + } + storage := offers.CreateRegistry(config) + + // Add offer + offerId := mesosutil.NewOfferID("foo") + mesosOffer := &mesos.Offer{Id: offerId} + storage.Add([]*mesos.Offer{mesosOffer}) + offer, ok := storage.Get(offerId.GetValue()) + assert.True(ok) + + // create registry + registry := NewInMemoryRegistry() + a, _ := fakePodTask("a") + registry.Register(a.Clone(), nil) // here clone a because we change it below + + // state changes are ignored + a.State = StateRunning + err := registry.Update(a) + assert.NoError(err) + a_clone, _ := registry.Get(a.ID) + assert.Equal(StatePending, a_clone.State) + + // offer is updated while pending + a.Offer = offer + err = registry.Update(a) + assert.NoError(err) + a_clone, _ = registry.Get(a.ID) + assert.Equal(offer.Id(), a_clone.Offer.Id()) + + // spec is updated while pending + a.Spec = Spec{SlaveID: "slave-1"} + err = registry.Update(a) + assert.NoError(err) + a_clone, _ = registry.Get(a.ID) + assert.Equal("slave-1", a_clone.Spec.SlaveID) + + // flags are updated while pending + a.Flags[Launched] = struct{}{} + err = registry.Update(a) + assert.NoError(err) + a_clone, _ = registry.Get(a.ID) + + _, found_launched := a_clone.Flags[Launched] + assert.True(found_launched) + + // flags are updated while running + registry.UpdateStatus(fakeStatusUpdate(a.ID, mesos.TaskState_TASK_RUNNING)) + a.Flags[Bound] = struct{}{} + err = registry.Update(a) + assert.NoError(err) + a_clone, _ = registry.Get(a.ID) + + _, found_launched = a_clone.Flags[Launched] + assert.True(found_launched) + _, found_bound := a_clone.Flags[Bound] + assert.True(found_bound) + + // spec is ignored while running + a.Spec = Spec{SlaveID: "slave-2"} + err = registry.Update(a) + assert.NoError(err) + a_clone, _ = registry.Get(a.ID) + assert.Equal("slave-1", a_clone.Spec.SlaveID) + + // error when finished + registry.UpdateStatus(fakeStatusUpdate(a.ID, mesos.TaskState_TASK_FINISHED)) + err = registry.Update(a) + assert.Error(err) + + // update unknown task + unknown_task, _ := fakePodTask("unknown-task") + err = registry.Update(unknown_task) + assert.Error(err) + + // update nil task + err = registry.Update(nil) + assert.Nil(err) +} + +type transition struct { + statusUpdate mesos.TaskState + expectedState *StateType + expectPanic bool +} + +func NewTransition(statusUpdate mesos.TaskState, expectedState StateType) transition { + return transition{statusUpdate: statusUpdate, expectedState: &expectedState, expectPanic: false} +} + +func NewTransitionToDeletedTask(statusUpdate mesos.TaskState) transition { + return transition{statusUpdate: statusUpdate, expectedState: nil, expectPanic: false} +} + +func NewTransitionWhichPanics(statusUpdate mesos.TaskState) transition { + return transition{statusUpdate: statusUpdate, expectPanic: true} +} + +func testStateTrace(t *testing.T, transitions []transition) *Registry { + assert := assert.New(t) + + registry := NewInMemoryRegistry() + a, _ := fakePodTask("a") + a, _ = registry.Register(a, nil) + + // initial pending state + assert.Equal(a.State, StatePending) + + for _, transition := range transitions { + if transition.expectPanic { + assert.Panics(func() { + 
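// panicking is expected for some transitions: handleTaskFinished treats a
+				// TASK_FINISHED update for a still-pending task as a programming error +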
registry.UpdateStatus(fakeStatusUpdate(a.ID, transition.statusUpdate)) + }) + } else { + a, _ = registry.UpdateStatus(fakeStatusUpdate(a.ID, transition.statusUpdate)) + if transition.expectedState == nil { + a, _ = registry.Get(a.ID) + assert.Nil(a, "expected task to be deleted from registry after status update to %v", transition.statusUpdate) + } else { + assert.Equal(a.State, *transition.expectedState) + } + } + } + + return ®istry +} + +func TestInMemoryRegistry_TaskLifeCycle(t *testing.T) { + testStateTrace(t, []transition{ + NewTransition(mesos.TaskState_TASK_STAGING, StatePending), + NewTransition(mesos.TaskState_TASK_STARTING, StatePending), + NewTransitionWhichPanics(mesos.TaskState_TASK_FINISHED), + NewTransition(mesos.TaskState_TASK_RUNNING, StateRunning), + NewTransition(mesos.TaskState_TASK_RUNNING, StateRunning), + NewTransition(mesos.TaskState_TASK_STARTING, StateRunning), + NewTransition(mesos.TaskState_TASK_FINISHED, StateFinished), + NewTransition(mesos.TaskState_TASK_FINISHED, StateFinished), + NewTransition(mesos.TaskState_TASK_RUNNING, StateFinished), + }) +} + +func TestInMemoryRegistry_NotFinished(t *testing.T) { + // all these behave the same + notFinishedStates := []mesos.TaskState{ + mesos.TaskState_TASK_FAILED, + mesos.TaskState_TASK_KILLED, + mesos.TaskState_TASK_LOST, + } + for _, notFinishedState := range notFinishedStates { + testStateTrace(t, []transition{ + NewTransitionToDeletedTask(notFinishedState), + }) + + testStateTrace(t, []transition{ + NewTransition(mesos.TaskState_TASK_RUNNING, StateRunning), + NewTransitionToDeletedTask(notFinishedState), + }) + + testStateTrace(t, []transition{ + NewTransition(mesos.TaskState_TASK_RUNNING, StateRunning), + NewTransition(mesos.TaskState_TASK_FINISHED, StateFinished), + NewTransition(notFinishedState, StateFinished), + }) + } +} diff --git a/contrib/mesos/pkg/scheduler/scheduler.go b/contrib/mesos/pkg/scheduler/scheduler.go new file mode 100644 index 00000000000..f6633cdacf6 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/scheduler.go @@ -0,0 +1,924 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package scheduler
+
+import (
+	"fmt"
+	"io"
+	"math"
+	"net/http"
+	"reflect"
+	"sync"
+	"time"
+
+	execcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/config"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/messages"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
+	offerMetrics "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers/metrics"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
+	schedcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/config"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/metrics"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
+	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/uid"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api/errors"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/container"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/tools"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
+	log "github.com/golang/glog"
+	mesos "github.com/mesos/mesos-go/mesosproto"
+	mutil "github.com/mesos/mesos-go/mesosutil"
+	bindings "github.com/mesos/mesos-go/scheduler"
+)
+
+type Slave struct {
+	HostName string
+}
+
+func newSlave(hostName string) *Slave {
+	return &Slave{
+		HostName: hostName,
+	}
+}
+
+type slaveStorage struct {
+	sync.Mutex
+	slaves map[string]*Slave // SlaveID => slave.
+}
+
+func newSlaveStorage() *slaveStorage {
+	return &slaveStorage{
+		slaves: make(map[string]*Slave),
+	}
+}
+
+// checkAndAdd creates a mapping between a slaveID and slave if one does not already exist.
+func (self *slaveStorage) checkAndAdd(slaveId, slaveHostname string) {
+	self.Lock()
+	defer self.Unlock()
+	_, exists := self.slaves[slaveId]
+	if !exists {
+		self.slaves[slaveId] = newSlave(slaveHostname)
+	}
+}
+
+func (self *slaveStorage) getSlaveIds() []string {
+	self.Lock()
+	defer self.Unlock()
+	slaveIds := make([]string, 0, len(self.slaves))
+	for slaveID := range self.slaves {
+		slaveIds = append(slaveIds, slaveID)
+	}
+	return slaveIds
+}
+
+func (self *slaveStorage) getSlave(slaveId string) (*Slave, bool) {
+	self.Lock()
+	defer self.Unlock()
+	slave, exists := self.slaves[slaveId]
+	return slave, exists
+}
+
+type PluginInterface interface {
+	// the apiserver may have a different state for the pod than we do,
+	// so reconcile our records, but only for this one pod
+	reconcilePod(api.Pod)
+
+	// execute the scheduling plugin; should start a goroutine and return immediately
+	Run(<-chan struct{})
+}
+
+// KubernetesScheduler implements:
+// 1: A mesos scheduler.
+// 2: A kubernetes scheduler plugin.
+// 3: A kubernetes pod.Registry.
+type KubernetesScheduler struct {
+	// We use a lock here to avoid races
+	// between invoking the mesos callback
+	// and invoking the pod registry interfaces.
+	// In particular, changes to podtask.T objects are currently guarded by this lock.
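+	// An illustrative sketch of the convention (not an exhaustive contract):
+	// writers take k.Lock()/k.Unlock() around mutations, while read-only paths
+	// such as asMaster() below use k.RLock()/k.RUnlock().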
+	*sync.RWMutex
+
+	// Config related, write-once
+
+	schedcfg          *schedcfg.Config
+	executor          *mesos.ExecutorInfo
+	executorGroup     uint64
+	scheduleFunc      PodScheduleFunc
+	client            *client.Client
+	etcdClient        tools.EtcdGetSet
+	failoverTimeout   float64 // in seconds
+	reconcileInterval int64
+
+	// Mesos context.
+
+	driver         bindings.SchedulerDriver // late initialization
+	frameworkId    *mesos.FrameworkID
+	masterInfo     *mesos.MasterInfo
+	registered     bool
+	registration   chan struct{} // signal chan that closes upon first successful registration
+	onRegistration sync.Once
+	offers         offers.Registry
+	slaves         *slaveStorage
+
+	// unsafe state, needs to be guarded
+
+	taskRegistry podtask.Registry
+
+	// via deferred init
+
+	plugin             PluginInterface
+	reconciler         *Reconciler
+	reconcileCooldown  time.Duration
+	asRegisteredMaster proc.Doer
+	terminate          <-chan struct{} // signal chan, closes when we should kill background tasks
+}
+
+type Config struct {
+	Schedcfg          schedcfg.Config
+	Executor          *mesos.ExecutorInfo
+	ScheduleFunc      PodScheduleFunc
+	Client            *client.Client
+	EtcdClient        tools.EtcdGetSet
+	FailoverTimeout   float64
+	ReconcileInterval int64
+	ReconcileCooldown time.Duration
+}
+
+// New creates a new KubernetesScheduler
+func New(config Config) *KubernetesScheduler {
+	var k *KubernetesScheduler
+	k = &KubernetesScheduler{
+		schedcfg:          &config.Schedcfg,
+		RWMutex:           new(sync.RWMutex),
+		executor:          config.Executor,
+		executorGroup:     uid.Parse(config.Executor.ExecutorId.GetValue()).Group(),
+		scheduleFunc:      config.ScheduleFunc,
+		client:            config.Client,
+		etcdClient:        config.EtcdClient,
+		failoverTimeout:   config.FailoverTimeout,
+		reconcileInterval: config.ReconcileInterval,
+		offers: offers.CreateRegistry(offers.RegistryConfig{
+			Compat: func(o *mesos.Offer) bool {
+				// filter the offers: the executor IDs must not identify a kubelet-
+				// executor with a group that doesn't match ours
+				for _, eid := range o.GetExecutorIds() {
+					execuid := uid.Parse(eid.GetValue())
+					if execuid.Name() == execcfg.DefaultInfoID && execuid.Group() != k.executorGroup {
+						return false
+					}
+				}
+				return true
+			},
+			DeclineOffer: func(id string) <-chan error {
+				errOnce := proc.NewErrorOnce(k.terminate)
+				errOuter := k.asRegisteredMaster.Do(func() {
+					var err error
+					defer func() { errOnce.Report(err) }() // report the final err value, not the nil captured at defer time
+					offerId := mutil.NewOfferID(id)
+					filters := &mesos.Filters{}
+					_, err = k.driver.DeclineOffer(offerId, filters)
+				})
+				return errOnce.Send(errOuter).Err()
+			},
+			// remember expired offers so that we can tell if a previously scheduled offer relies on one
+			LingerTTL:     config.Schedcfg.OfferLingerTTL.Duration,
+			TTL:           config.Schedcfg.OfferTTL.Duration,
+			ListenerDelay: config.Schedcfg.ListenerDelay.Duration,
+		}),
+		slaves:            newSlaveStorage(),
+		taskRegistry:      podtask.NewInMemoryRegistry(),
+		reconcileCooldown: config.ReconcileCooldown,
+		registration:      make(chan struct{}),
+		asRegisteredMaster: proc.DoerFunc(func(proc.Action) <-chan error {
+			return proc.ErrorChanf("cannot execute action with unregistered scheduler")
+		}),
+	}
+	return k
+}
+
+func (k *KubernetesScheduler) Init(electedMaster proc.Process, pl PluginInterface, mux *http.ServeMux) error {
+	log.V(1).Infoln("initializing kubernetes mesos scheduler")
+
+	k.asRegisteredMaster = proc.DoerFunc(func(a proc.Action) <-chan error {
+		if !k.registered {
+			return proc.ErrorChanf("failed to execute action, scheduler is disconnected")
+		}
+		return electedMaster.Do(a)
+	})
+	k.terminate = electedMaster.Done()
+	k.plugin = pl
+	k.offers.Init(k.terminate)
+	k.InstallDebugHandlers(mux)
+	return k.recoverTasks()
+}
+
+func (k *KubernetesScheduler) asMaster() proc.Doer {
+	k.RLock()
+	defer k.RUnlock()
+	return k.asRegisteredMaster
+}
+
+func (k *KubernetesScheduler) InstallDebugHandlers(mux *http.ServeMux) {
+	wrappedHandler := func(uri string, h http.Handler) {
+		mux.HandleFunc(uri, func(w http.ResponseWriter, r *http.Request) {
+			ch := make(chan struct{})
+			closer := runtime.Closer(ch)
+			proc.OnError(k.asMaster().Do(func() {
+				defer closer()
+				h.ServeHTTP(w, r)
+			}), func(err error) {
+				defer closer()
+				log.Warningf("failed HTTP request for %s: %v", uri, err)
+				w.WriteHeader(http.StatusServiceUnavailable)
+			}, k.terminate)
+			select {
+			case <-time.After(k.schedcfg.HttpHandlerTimeout.Duration):
+				log.Warningf("timed out waiting for request to be processed")
+				w.WriteHeader(http.StatusServiceUnavailable)
+				return
+			case <-ch: // noop
+			}
+		})
+	}
+	requestReconciliation := func(uri string, requestAction func()) {
+		wrappedHandler(uri, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			requestAction()
+			w.WriteHeader(http.StatusNoContent)
+		}))
+	}
+	requestReconciliation("/debug/actions/requestExplicit", k.reconciler.RequestExplicit)
+	requestReconciliation("/debug/actions/requestImplicit", k.reconciler.RequestImplicit)
+
+	wrappedHandler("/debug/actions/kamikaze", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		slaves := k.slaves.getSlaveIds()
+		for _, slaveId := range slaves {
+			_, err := k.driver.SendFrameworkMessage(
+				k.executor.ExecutorId,
+				mutil.NewSlaveID(slaveId),
+				messages.Kamikaze)
+			if err != nil {
+				log.Warningf("failed to send kamikaze message to slave %s: %v", slaveId, err)
+			} else {
+				io.WriteString(w, fmt.Sprintf("kamikaze slave %s\n", slaveId))
+			}
+		}
+		io.WriteString(w, "OK")
+	}))
+}
+
+func (k *KubernetesScheduler) Registration() <-chan struct{} {
+	return k.registration
+}
+
+// Registered is called when the scheduler has registered with the master successfully.
+func (k *KubernetesScheduler) Registered(drv bindings.SchedulerDriver, fid *mesos.FrameworkID, mi *mesos.MasterInfo) {
+	log.Infof("Scheduler registered with the master: %v with frameworkId: %v\n", mi, fid)
+
+	k.driver = drv
+	k.frameworkId = fid
+	k.masterInfo = mi
+	k.registered = true
+
+	k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
+	k.reconciler.RequestExplicit()
+}
+
+func (k *KubernetesScheduler) storeFrameworkId() {
+	// TODO(jdef): port FrameworkId store to generic Kubernetes config store as soon as available
+	_, err := k.etcdClient.Set(meta.FrameworkIDKey, k.frameworkId.GetValue(), uint64(k.failoverTimeout))
+	if err != nil {
+		log.Errorf("failed to renew frameworkId TTL: %v", err)
+	}
+}
+
+// Reregistered is called when the scheduler has re-registered with the master successfully.
+// This happens when the master fails over.
+func (k *KubernetesScheduler) Reregistered(drv bindings.SchedulerDriver, mi *mesos.MasterInfo) {
+	log.Infof("Scheduler reregistered with the master: %v\n", mi)
+
+	k.driver = drv
+	k.masterInfo = mi
+	k.registered = true
+
+	k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
+	k.reconciler.RequestExplicit()
+}
+
+// perform one-time initialization actions upon the first registration event received from Mesos.
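+// Illustrative numbers for the frameworkId refresh scheduled below: with a
+// failover timeout of 60s (and a larger configured refresh interval), the
+// frameworkId record is re-stored every max(1, 60/2) = 30s, well inside the
+// record's TTL.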
+func (k *KubernetesScheduler) onInitialRegistration(driver bindings.SchedulerDriver) {
+	defer close(k.registration)
+
+	if k.failoverTimeout > 0 {
+		refreshInterval := k.schedcfg.FrameworkIdRefreshInterval.Duration
+		if k.failoverTimeout < k.schedcfg.FrameworkIdRefreshInterval.Duration.Seconds() {
+			refreshInterval = time.Duration(math.Max(1, k.failoverTimeout/2)) * time.Second
+		}
+		go runtime.Until(k.storeFrameworkId, refreshInterval, k.terminate)
+	}
+
+	r1 := k.makeTaskRegistryReconciler()
+	r2 := k.makePodRegistryReconciler()
+
+	k.reconciler = newReconciler(k.asRegisteredMaster, k.makeCompositeReconciler(r1, r2),
+		k.reconcileCooldown, k.schedcfg.ExplicitReconciliationAbortTimeout.Duration, k.terminate)
+	go k.reconciler.Run(driver)
+
+	if k.reconcileInterval > 0 {
+		ri := time.Duration(k.reconcileInterval) * time.Second
+		time.AfterFunc(k.schedcfg.InitialImplicitReconciliationDelay.Duration, func() { runtime.Until(k.reconciler.RequestImplicit, ri, k.terminate) })
+		log.Infof("will perform implicit task reconciliation at interval: %v after %v", ri, k.schedcfg.InitialImplicitReconciliationDelay.Duration)
+	}
+}
+
+// Disconnected is called when the scheduler loses connection to the master.
+func (k *KubernetesScheduler) Disconnected(driver bindings.SchedulerDriver) {
+	log.Infof("Master disconnected!\n")
+
+	k.registered = false
+
+	// discard all cached offers to avoid unnecessary TASK_LOST updates
+	k.offers.Invalidate("")
+}
+
+// ResourceOffers is called when the scheduler receives some offers from the master.
+func (k *KubernetesScheduler) ResourceOffers(driver bindings.SchedulerDriver, offers []*mesos.Offer) {
+	log.V(2).Infof("Received offers %+v", offers)
+
+	// Record the offers in the global offer map as well as each slave's offer map.
+	k.offers.Add(offers)
+	for _, offer := range offers {
+		slaveId := offer.GetSlaveId().GetValue()
+		k.slaves.checkAndAdd(slaveId, offer.GetHostname())
+	}
+}
+
+// OfferRescinded is called when the resources are rescinded from the scheduler.
+func (k *KubernetesScheduler) OfferRescinded(driver bindings.SchedulerDriver, offerId *mesos.OfferID) {
+	log.Infof("Offer rescinded %v\n", offerId)
+
+	oid := offerId.GetValue()
+	k.offers.Delete(oid, offerMetrics.OfferRescinded)
+}
+
+// StatusUpdate is called when a status update message is sent to the scheduler.
+func (k *KubernetesScheduler) StatusUpdate(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
+
+	source, reason := "none", "none"
+	if taskStatus.Source != nil {
+		source = (*taskStatus.Source).String()
+	}
+	if taskStatus.Reason != nil {
+		reason = (*taskStatus.Reason).String()
+	}
+	taskState := taskStatus.GetState()
+	metrics.StatusUpdates.WithLabelValues(source, reason, taskState.String()).Inc()
+
+	log.Infof(
+		"task status update %q from %q for task %q on slave %q executor %q for reason %q",
+		taskState.String(),
+		source,
+		taskStatus.TaskId.GetValue(),
+		taskStatus.SlaveId.GetValue(),
+		taskStatus.ExecutorId.GetValue(),
+		reason)
+
+	switch taskState {
+	case mesos.TaskState_TASK_RUNNING, mesos.TaskState_TASK_FINISHED, mesos.TaskState_TASK_STARTING, mesos.TaskState_TASK_STAGING:
+		if _, state := k.taskRegistry.UpdateStatus(taskStatus); state == podtask.StateUnknown {
+			if taskState != mesos.TaskState_TASK_FINISHED {
+				//TODO(jdef) what if I receive this after a TASK_LOST or TASK_KILLED?
+				//I don't want to reincarnate then.. TASK_LOST is a special case because
+				//the master is stateless and there are scenarios where I may get TASK_LOST
+				//followed by TASK_RUNNING.
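+				//(e.g. an illustrative sequence: a reconciliation sweep reports
+				//TASK_LOST moments before the live slave reports TASK_RUNNING;
+				//tearing down state on the first update would orphan the second)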
+				//TODO(jdef) consider running this asynchronously since there are API server
+				//calls that may be made
+				k.reconcileNonTerminalTask(driver, taskStatus)
+			} // else, we don't really care about FINISHED tasks that aren't registered
+			return
+		}
+		if _, exists := k.slaves.getSlave(taskStatus.GetSlaveId().GetValue()); !exists {
+			// a registered task has an update reported by a slave that we don't recognize.
+			// this should never happen! So we don't reconcile it.
+			log.Errorf("Ignore status %+v because the slave does not exist", taskStatus)
+			return
+		}
+	case mesos.TaskState_TASK_FAILED:
+		if task, _ := k.taskRegistry.UpdateStatus(taskStatus); task != nil {
+			if task.Has(podtask.Launched) && !task.Has(podtask.Bound) {
+				go k.plugin.reconcilePod(task.Pod)
+				return
+			}
+		} else {
+			// unknown task failed, not much we can do about it
+			return
+		}
+		// last-ditch effort to reconcile our records
+		fallthrough
+	case mesos.TaskState_TASK_LOST, mesos.TaskState_TASK_KILLED:
+		k.reconcileTerminalTask(driver, taskStatus)
+	}
+}
+
+func (k *KubernetesScheduler) reconcileTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
+	task, state := k.taskRegistry.UpdateStatus(taskStatus)
+
+	if (state == podtask.StateRunning || state == podtask.StatePending) && taskStatus.SlaveId != nil &&
+		((taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER && taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION) ||
+			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED) ||
+			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED)) {
+		//--
+		// pod-task has metadata that refers to:
+		// (1) a task that Mesos no longer knows about, or else
+		// (2) a pod that the Kubelet will never report as "failed"
+		// For now, destroy the pod and hope that there's a replication controller backing it up.
+		// TODO(jdef) for case #2 don't delete the pod, just update its status to Failed
+		pod := &task.Pod
+		log.Warningf("deleting rogue pod %v/%v for lost task %v", pod.Namespace, pod.Name, task.ID)
+		if err := k.client.Pods(pod.Namespace).Delete(pod.Name, nil); err != nil && !errors.IsNotFound(err) {
+			log.Errorf("failed to delete pod %v/%v for terminal task %v: %v", pod.Namespace, pod.Name, task.ID, err)
+		}
+	} else if taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED || taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED {
+		// attempt to prevent dangling pods in the pod and task registries
+		log.V(1).Infof("request explicit reconciliation to clean up for task %v after executor reported (terminated/unregistered)", taskStatus.TaskId.GetValue())
+		k.reconciler.RequestExplicit()
+	} else if taskStatus.GetState() == mesos.TaskState_TASK_LOST && state == podtask.StateRunning && taskStatus.ExecutorId != nil && taskStatus.SlaveId != nil {
+		//TODO(jdef) this may not be meaningful once we have proper checkpointing and master detection
+		//If we're reconciling and receive this then the executor may be
+		//running a task that we need it to kill. It's possible that the framework
+		//is unrecognized by the master at this point, so KillTask is not guaranteed
+		//to do anything. The underlying driver transport may be able to send a
+		//FrameworkMessage directly to the slave to terminate the task.
+		log.V(2).Infof("forwarding TASK_LOST message to executor %v on slave %v", taskStatus.ExecutorId, taskStatus.SlaveId)
+		data := fmt.Sprintf("task-lost:%s", task.ID) //TODO(jdef) use a real message type
+		if _, err := driver.SendFrameworkMessage(taskStatus.ExecutorId, taskStatus.SlaveId, data); err != nil {
+			log.Error(err.Error())
+		}
+	}
+}
+
+// reconcile an unknown (from the perspective of our registry) non-terminal task
+func (k *KubernetesScheduler) reconcileNonTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
+	// attempt to recover task from pod info:
+	// - task data may contain an api.PodStatusResult; if status.reason == REASON_RECONCILIATION then status.data == nil
+	// - the Name can be parsed by container.ParseFullName() to yield a pod Name and Namespace
+	// - pull the pod metadata down from the api server
+	// - perform task recovery based on pod metadata
+	taskId := taskStatus.TaskId.GetValue()
+	if taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION && taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER {
+		// there will be no data in the task status that we can use to determine the associated pod
+		switch taskStatus.GetState() {
+		case mesos.TaskState_TASK_STAGING:
+			// there is still hope for this task, don't kill it just yet
+			//TODO(jdef) there should probably be a limit for how long we tolerate tasks stuck in this state
+			return
+		default:
+			// for TASK_{STARTING,RUNNING} we should already have attempted recovery via recoverTasks().
+			// if the scheduler failed over before the executor fired TASK_STARTING, then we should *not*
+			// be processing this reconciliation update before we process the one from the executor.
+			// point: we don't know what this task is (perhaps there was unrecoverable metadata in the pod),
+			// so it gets killed.
+			log.Errorf("killing non-terminal, unrecoverable task %v", taskId)
+		}
+	} else if podStatus, err := podtask.ParsePodStatusResult(taskStatus); err != nil {
+		// possible rogue pod exists at this point because we can't identify it; should kill the task
+		log.Errorf("possible rogue pod; illegal task status data for task %v, expected an api.PodStatusResult: %v", taskId, err)
+	} else if name, namespace, err := container.ParsePodFullName(podStatus.Name); err != nil {
+		// possible rogue pod exists at this point because we can't identify it; should kill the task
+		log.Errorf("possible rogue pod; illegal api.PodStatusResult, unable to parse full pod name from: '%v' for task %v: %v",
+			podStatus.Name, taskId, err)
+	} else if pod, err := k.client.Pods(namespace).Get(name); err == nil {
+		if t, ok, err := podtask.RecoverFrom(*pod); ok {
+			log.Infof("recovered task %v from metadata in pod %v/%v", taskId, namespace, name)
+			_, err := k.taskRegistry.Register(t, nil)
+			if err != nil {
+				// someone beat us to it?!
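+				// (most likely a racing status update registered the task first;
+				// losing that race is harmless)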
+ log.Warningf("failed to register recovered task: %v", err) + return + } else { + k.taskRegistry.UpdateStatus(taskStatus) + } + return + } else if err != nil { + //should kill the pod and the task + log.Errorf("killing pod, failed to recover task from pod %v/%v: %v", namespace, name, err) + if err := k.client.Pods(namespace).Delete(name, nil); err != nil { + log.Errorf("failed to delete pod %v/%v: %v", namespace, name, err) + } + } else { + //this is pretty unexpected: we received a TASK_{STARTING,RUNNING} message, but the apiserver's pod + //metadata is not appropriate for task reconstruction -- which should almost certainly never + //be the case unless someone swapped out the pod on us (and kept the same namespace/name) while + //we were failed over. + + //kill this task, allow the newly launched scheduler to schedule the new pod + log.Warningf("unexpected pod metadata for task %v in apiserver, assuming new unscheduled pod spec: %+v", taskId, pod) + } + } else if errors.IsNotFound(err) { + // pod lookup failed, should delete the task since the pod is no longer valid; may be redundant, that's ok + log.Infof("killing task %v since pod %v/%v no longer exists", taskId, namespace, name) + } else if errors.IsServerTimeout(err) { + log.V(2).Infof("failed to reconcile task due to API server timeout: %v", err) + return + } else { + log.Errorf("unexpected API server error, aborting reconcile for task %v: %v", taskId, err) + return + } + if _, err := driver.KillTask(taskStatus.TaskId); err != nil { + log.Errorf("failed to kill task %v: %v", taskId, err) + } +} + +// FrameworkMessage is called when the scheduler receives a message from the executor. +func (k *KubernetesScheduler) FrameworkMessage(driver bindings.SchedulerDriver, + executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, message string) { + log.Infof("Received messages from executor %v of slave %v, %v\n", executorId, slaveId, message) +} + +// SlaveLost is called when some slave is lost. +func (k *KubernetesScheduler) SlaveLost(driver bindings.SchedulerDriver, slaveId *mesos.SlaveID) { + log.Infof("Slave %v is lost\n", slaveId) + + sid := slaveId.GetValue() + k.offers.InvalidateForSlave(sid) + + // TODO(jdef): delete slave from our internal list? probably not since we may need to reconcile + // tasks. it would be nice to somehow flag the slave as lost so that, perhaps, we can periodically + // flush lost slaves older than X, and for which no tasks or pods reference. + + // unfinished tasks/pods will be dropped. use a replication controller if you want pods to + // be restarted when slaves die. +} + +// ExecutorLost is called when some executor is lost. +func (k *KubernetesScheduler) ExecutorLost(driver bindings.SchedulerDriver, executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, status int) { + log.Infof("Executor %v of slave %v is lost, status: %v\n", executorId, slaveId, status) + // TODO(yifan): Restart any unfinished tasks of the executor. +} + +// Error is called when there is an unrecoverable error in the scheduler or scheduler driver. +// The driver should have been aborted before this is invoked. +func (k *KubernetesScheduler) Error(driver bindings.SchedulerDriver, message string) { + log.Fatalf("fatal scheduler error: %v\n", message) +} + +// filter func used for explicit task reconciliation, selects only non-terminal tasks which +// have been communicated to mesos (read: launched). 
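+//
+// For example, listing every reconcilable task from the registry (sketch;
+// k is a *KubernetesScheduler):
+//
+//	tasks := k.taskRegistry.List(explicitTaskFilter)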
+func explicitTaskFilter(t *podtask.T) bool { + switch t.State { + case podtask.StateRunning: + return true + case podtask.StatePending: + return t.Has(podtask.Launched) + default: + return false + } +} + +// invoke the given ReconcilerAction funcs in sequence, aborting the sequence if reconciliation +// is cancelled. if any other errors occur the composite reconciler will attempt to complete the +// sequence, reporting only the last generated error. +func (k *KubernetesScheduler) makeCompositeReconciler(actions ...ReconcilerAction) ReconcilerAction { + if x := len(actions); x == 0 { + // programming error + panic("no actions specified for composite reconciler") + } else if x == 1 { + return actions[0] + } + chained := func(d bindings.SchedulerDriver, c <-chan struct{}, a, b ReconcilerAction) <-chan error { + ech := a(d, c) + ch := make(chan error, 1) + go func() { + select { + case <-k.terminate: + case <-c: + case e := <-ech: + if e != nil { + ch <- e + return + } + ech = b(d, c) + select { + case <-k.terminate: + case <-c: + case e := <-ech: + if e != nil { + ch <- e + return + } + close(ch) + return + } + } + ch <- fmt.Errorf("aborting composite reconciler action") + }() + return ch + } + result := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error { + return chained(d, c, actions[0], actions[1]) + } + for i := 2; i < len(actions); i++ { + i := i + next := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error { + return chained(d, c, ReconcilerAction(result), actions[i]) + } + result = next + } + return ReconcilerAction(result) +} + +// reconciler action factory, performs explicit task reconciliation for non-terminal +// tasks listed in the scheduler's internal taskRegistry. +func (k *KubernetesScheduler) makeTaskRegistryReconciler() ReconcilerAction { + return ReconcilerAction(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error { + taskToSlave := make(map[string]string) + for _, t := range k.taskRegistry.List(explicitTaskFilter) { + if t.Spec.SlaveID != "" { + taskToSlave[t.ID] = t.Spec.SlaveID + } + } + return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel)) + }) +} + +// reconciler action factory, performs explicit task reconciliation for non-terminal +// tasks identified by annotations in the Kubernetes pod registry. 
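+//
+// A ReconcilerAction takes a driver and a cancel channel and reports errors
+// asynchronously; a minimal invocation sketch (driver/cancel hypothetical):
+//
+//	action := k.makePodRegistryReconciler()
+//	if err := <-action(driver, cancel); err != nil {
+//		log.Errorf("reconciliation failed: %v", err)
+//	}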
+func (k *KubernetesScheduler) makePodRegistryReconciler() ReconcilerAction { + return ReconcilerAction(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error { + ctx := api.NewDefaultContext() + podList, err := k.client.Pods(api.NamespaceValue(ctx)).List(labels.Everything(), fields.Everything()) + if err != nil { + return proc.ErrorChanf("failed to reconcile pod registry: %v", err) + } + taskToSlave := make(map[string]string) + for _, pod := range podList.Items { + if len(pod.Annotations) == 0 { + continue + } + taskId, found := pod.Annotations[meta.TaskIdKey] + if !found { + continue + } + slaveId, found := pod.Annotations[meta.SlaveIdKey] + if !found { + continue + } + taskToSlave[taskId] = slaveId + } + return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel)) + }) +} + +// execute an explicit task reconciliation, as per http://mesos.apache.org/documentation/latest/reconciliation/ +func (k *KubernetesScheduler) explicitlyReconcileTasks(driver bindings.SchedulerDriver, taskToSlave map[string]string, cancel <-chan struct{}) error { + log.Info("explicit reconcile tasks") + + // tell mesos to send us the latest status updates for all the non-terminal tasks that we know about + statusList := []*mesos.TaskStatus{} + remaining := util.KeySet(reflect.ValueOf(taskToSlave)) + for taskId, slaveId := range taskToSlave { + if slaveId == "" { + delete(taskToSlave, taskId) + continue + } + statusList = append(statusList, &mesos.TaskStatus{ + TaskId: mutil.NewTaskID(taskId), + SlaveId: mutil.NewSlaveID(slaveId), + State: mesos.TaskState_TASK_RUNNING.Enum(), // req'd field, doesn't have to reflect reality + }) + } + + select { + case <-cancel: + return reconciliationCancelledErr + default: + if _, err := driver.ReconcileTasks(statusList); err != nil { + return err + } + } + + start := time.Now() + first := true + for backoff := 1 * time.Second; first || remaining.Len() > 0; backoff = backoff * 2 { + first = false + // nothing to do here other than wait for status updates.. 
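+		// backoff doubles on each pass and is capped just below; e.g. with a
+		// hypothetical 32s cap the waits are 1s, 2s, 4s, 8s, 16s, 32s, 32s, ...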
+		if backoff > k.schedcfg.ExplicitReconciliationMaxBackoff.Duration {
+			backoff = k.schedcfg.ExplicitReconciliationMaxBackoff.Duration
+		}
+		select {
+		case <-cancel:
+			return reconciliationCancelledErr
+		case <-time.After(backoff):
+			for taskId := range remaining {
+				if task, _ := k.taskRegistry.Get(taskId); task != nil && explicitTaskFilter(task) && task.UpdatedTime.Before(start) {
+					// keep this task in remaining list
+					continue
+				}
+				remaining.Delete(taskId)
+			}
+		}
+	}
+	return nil
+}
+
+var (
+	reconciliationCancelledErr = fmt.Errorf("explicit task reconciliation cancelled")
+)
+
+type ReconcilerAction func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error
+
+type Reconciler struct {
+	proc.Doer
+	Action   ReconcilerAction
+	explicit chan struct{}   // send an empty struct to trigger explicit reconciliation
+	implicit chan struct{}   // send an empty struct to trigger implicit reconciliation
+	done     <-chan struct{} // close this when you want the reconciler to exit
+	cooldown time.Duration
+	explicitReconciliationAbortTimeout time.Duration
+}
+
+func newReconciler(doer proc.Doer, action ReconcilerAction,
+	cooldown, explicitReconciliationAbortTimeout time.Duration, done <-chan struct{}) *Reconciler {
+	return &Reconciler{
+		Doer:     doer,
+		explicit: make(chan struct{}, 1),
+		implicit: make(chan struct{}, 1),
+		cooldown: cooldown,
+		explicitReconciliationAbortTimeout: explicitReconciliationAbortTimeout,
+		done: done,
+		Action: func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
+			// trigger the reconciler action in the doer's execution context,
+			// but it could take a while and the scheduler needs to be able to
+			// process updates, the callbacks for which ALSO execute in the SAME
+			// deferred execution context -- so the action MUST be executed async.
+			errOnce := proc.NewErrorOnce(cancel)
+			return errOnce.Send(doer.Do(func() {
+				// only triggered if we're the currently elected, registered
+				// master; run the action async and forward its error chan.
+				go func() {
+					errOnce.Send(action(driver, cancel))
+				}()
+			})).Err()
+		},
+	}
+}
+
+func (r *Reconciler) RequestExplicit() {
+	select {
+	case r.explicit <- struct{}{}: // noop
+	default: // request queue full; noop
+	}
+}
+
+func (r *Reconciler) RequestImplicit() {
+	select {
+	case r.implicit <- struct{}{}: // noop
+	default: // request queue full; noop
+	}
+}
+
+// execute task reconciliation, returns when r.done is closed. intended to run as a goroutine.
+// if reconciliation is requested while another is in progress, the in-progress operation will be
+// cancelled before the new reconciliation operation begins.
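+//
+// Typical wiring (sketch; mirrors onInitialRegistration above):
+//
+//	r := newReconciler(doer, action, cooldown, abortTimeout, doneCh)
+//	go r.Run(driver)
+//	r.RequestExplicit() // queue a reconciliation request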
+func (r *Reconciler) Run(driver bindings.SchedulerDriver) { + var cancel, finished chan struct{} +requestLoop: + for { + select { + case <-r.done: + return + default: // proceed + } + select { + case <-r.implicit: + metrics.ReconciliationRequested.WithLabelValues("implicit").Inc() + select { + case <-r.done: + return + case <-r.explicit: + break // give preference to a pending request for explicit + default: // continue + // don't run implicit reconciliation while explicit is ongoing + if finished != nil { + select { + case <-finished: // continue w/ implicit + default: + log.Infoln("skipping implicit reconcile because explicit reconcile is ongoing") + continue requestLoop + } + } + errOnce := proc.NewErrorOnce(r.done) + errCh := r.Do(func() { + var err error + defer errOnce.Report(err) + log.Infoln("implicit reconcile tasks") + metrics.ReconciliationExecuted.WithLabelValues("implicit").Inc() + if _, err = driver.ReconcileTasks([]*mesos.TaskStatus{}); err != nil { + log.V(1).Infof("failed to request implicit reconciliation from mesos: %v", err) + } + }) + proc.OnError(errOnce.Send(errCh).Err(), func(err error) { + log.Errorf("failed to run implicit reconciliation: %v", err) + }, r.done) + goto slowdown + } + case <-r.done: + return + case <-r.explicit: // continue + metrics.ReconciliationRequested.WithLabelValues("explicit").Inc() + } + + if cancel != nil { + close(cancel) + cancel = nil + + // play nice and wait for the prior operation to finish, complain + // if it doesn't + select { + case <-r.done: + return + case <-finished: // noop, expected + case <-time.After(r.explicitReconciliationAbortTimeout): // very unexpected + log.Error("reconciler action failed to stop upon cancellation") + } + } + // copy 'finished' to 'fin' here in case we end up with simultaneous go-routines, + // if cancellation takes too long or fails - we don't want to close the same chan + // more than once + cancel = make(chan struct{}) + finished = make(chan struct{}) + go func(fin chan struct{}) { + startedAt := time.Now() + defer func() { + metrics.ReconciliationLatency.Observe(metrics.InMicroseconds(time.Since(startedAt))) + }() + + metrics.ReconciliationExecuted.WithLabelValues("explicit").Inc() + defer close(fin) + err := <-r.Action(driver, cancel) + if err == reconciliationCancelledErr { + metrics.ReconciliationCancelled.WithLabelValues("explicit").Inc() + log.Infoln(err.Error()) + } else if err != nil { + log.Errorf("reconciler action failed: %v", err) + } + }(finished) + slowdown: + // don't allow reconciliation to run very frequently, either explicit or implicit + select { + case <-r.done: + return + case <-time.After(r.cooldown): // noop + } + } // for +} + +func (ks *KubernetesScheduler) recoverTasks() error { + ctx := api.NewDefaultContext() + podList, err := ks.client.Pods(api.NamespaceValue(ctx)).List(labels.Everything(), fields.Everything()) + if err != nil { + log.V(1).Infof("failed to recover pod registry, madness may ensue: %v", err) + return err + } + recoverSlave := func(t *podtask.T) { + + slaveId := t.Spec.SlaveID + ks.slaves.checkAndAdd(slaveId, t.Offer.Host()) + } + for _, pod := range podList.Items { + if t, ok, err := podtask.RecoverFrom(pod); err != nil { + log.Errorf("failed to recover task from pod, will attempt to delete '%v/%v': %v", pod.Namespace, pod.Name, err) + err := ks.client.Pods(pod.Namespace).Delete(pod.Name, nil) + //TODO(jdef) check for temporary or not-found errors + if err != nil { + log.Errorf("failed to delete pod '%v/%v': %v", pod.Namespace, pod.Name, err) + } + } 
else if ok { + ks.taskRegistry.Register(t, nil) + recoverSlave(t) + log.Infof("recovered task %v from pod %v/%v", t.ID, pod.Namespace, pod.Name) + } + } + return nil +} diff --git a/contrib/mesos/pkg/scheduler/scheduler_test.go b/contrib/mesos/pkg/scheduler/scheduler_test.go new file mode 100644 index 00000000000..e4fe2577e25 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/scheduler_test.go @@ -0,0 +1,350 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheduler + +import ( + "testing" + + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc" + schedcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/config" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask" + mesos "github.com/mesos/mesos-go/mesosproto" + util "github.com/mesos/mesos-go/mesosutil" + "github.com/stretchr/testify/assert" +) + +// Check that same slave is only added once. +func TestSlaveStorage_checkAndAdd(t *testing.T) { + assert := assert.New(t) + + slaveStorage := newSlaveStorage() + assert.Equal(0, len(slaveStorage.slaves)) + + slaveId := "slave1" + slaveHostname := "slave1Hostname" + slaveStorage.checkAndAdd(slaveId, slaveHostname) + assert.Equal(1, len(slaveStorage.getSlaveIds())) + + slaveStorage.checkAndAdd(slaveId, slaveHostname) + assert.Equal(1, len(slaveStorage.getSlaveIds())) +} + +// Check that getSlave returns notExist for nonexisting slave. +func TestSlaveStorage_getSlave(t *testing.T) { + assert := assert.New(t) + + slaveStorage := newSlaveStorage() + assert.Equal(0, len(slaveStorage.slaves)) + + slaveId := "slave1" + slaveHostname := "slave1Hostname" + + _, exists := slaveStorage.getSlave(slaveId) + assert.Equal(false, exists) + + slaveStorage.checkAndAdd(slaveId, slaveHostname) + assert.Equal(1, len(slaveStorage.getSlaveIds())) + + _, exists = slaveStorage.getSlave(slaveId) + assert.Equal(true, exists) +} + +// Check that getSlaveIds returns array with all slaveIds. 
+func TestSlaveStorage_getSlaveIds(t *testing.T) {
+	assert := assert.New(t)
+
+	slaveStorage := newSlaveStorage()
+	assert.Equal(0, len(slaveStorage.slaves))
+
+	slaveId := "1"
+	slaveHostname := "hn1"
+	slaveStorage.checkAndAdd(slaveId, slaveHostname)
+	assert.Equal(1, len(slaveStorage.getSlaveIds()))
+
+	slaveId = "2"
+	slaveHostname = "hn2"
+	slaveStorage.checkAndAdd(slaveId, slaveHostname)
+	assert.Equal(2, len(slaveStorage.getSlaveIds()))
+
+	slaveIds := slaveStorage.getSlaveIds()
+
+	slaveIdsMap := make(map[string]bool, len(slaveIds))
+	for _, s := range slaveIds {
+		slaveIdsMap[s] = true
+	}
+
+	_, ok := slaveIdsMap["1"]
+	assert.Equal(ok, true)
+
+	_, ok = slaveIdsMap["2"]
+	assert.Equal(ok, true)
+}
+
+//get the number of non-expired offers in the offer registry
+func getNumberOffers(os offers.Registry) int {
+	//walk the registry and count every offer stored in it
+	walked := 0
+	walker1 := func(p offers.Perishable) (bool, error) {
+		walked++
+		return false, nil
+	}
+	os.Walk(walker1)
+	return walked
+}
+
+//test adding of resource offers; they should be added to the offer registry and to slaves
+func TestResourceOffer_Add(t *testing.T) {
+	assert := assert.New(t)
+
+	testScheduler := &KubernetesScheduler{
+		offers: offers.CreateRegistry(offers.RegistryConfig{
+			Compat: func(o *mesos.Offer) bool {
+				return true
+			},
+			DeclineOffer: func(offerId string) <-chan error {
+				return proc.ErrorChan(nil)
+			},
+			// remember expired offers so that we can tell if a previously scheduled pod relies on one
+			LingerTTL:     schedcfg.DefaultOfferLingerTTL,
+			TTL:           schedcfg.DefaultOfferTTL,
+			ListenerDelay: schedcfg.DefaultListenerDelay,
+		}),
+		slaves: newSlaveStorage(),
+	}
+
+	hostname := "h1"
+	offerID1 := util.NewOfferID("test1")
+	offer1 := &mesos.Offer{Id: offerID1, Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
+	offers1 := []*mesos.Offer{offer1}
+	testScheduler.ResourceOffers(nil, offers1)
+
+	assert.Equal(1, getNumberOffers(testScheduler.offers))
+	//check slave hostname
+	assert.Equal(1, len(testScheduler.slaves.getSlaveIds()))
+
+	//add another offer
+	hostname2 := "h2"
+	offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
+	offers2 := []*mesos.Offer{offer2}
+	testScheduler.ResourceOffers(nil, offers2)
+
+	//check it is stored in the registry
+	assert.Equal(2, getNumberOffers(testScheduler.offers))
+
+	//check slave hostnames
+	assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))
+}
+
+//test adding and rescinding of resource offers; rescinded offers should be removed from the offer registry
+func TestResourceOffer_Add_Rescind(t *testing.T) {
+	assert := assert.New(t)
+
+	testScheduler := &KubernetesScheduler{
+		offers: offers.CreateRegistry(offers.RegistryConfig{
+			Compat: func(o *mesos.Offer) bool {
+				return true
+			},
+			DeclineOffer: func(offerId string) <-chan error {
+				return proc.ErrorChan(nil)
+			},
+			// remember expired offers so that we can tell if a previously scheduled pod relies on one
+			LingerTTL:     schedcfg.DefaultOfferLingerTTL,
+			TTL:           schedcfg.DefaultOfferTTL,
+			ListenerDelay: schedcfg.DefaultListenerDelay,
+		}),
+		slaves: newSlaveStorage(),
+	}
+
+	hostname := "h1"
+	offerID1 := util.NewOfferID("test1")
+	offer1 := &mesos.Offer{Id: offerID1, Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
+	offers1 := []*mesos.Offer{offer1}
+	testScheduler.ResourceOffers(nil, offers1)
+
+	assert.Equal(1, getNumberOffers(testScheduler.offers))
+
+	//check slave hostname
+	assert.Equal(1, len(testScheduler.slaves.getSlaveIds()))
+
+	//add another offer
+	hostname2 := "h2"
+	offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
+	offers2 := []*mesos.Offer{offer2}
+	testScheduler.ResourceOffers(nil, offers2)
+
+	assert.Equal(2, getNumberOffers(testScheduler.offers))
+
+	//check slave hostnames
+	assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))
+
+	//test whether the first offer can be rescinded
+	testScheduler.OfferRescinded(nil, offerID1)
+	assert.Equal(1, getNumberOffers(testScheduler.offers))
+
+	//test whether the second offer can be rescinded
+	testScheduler.OfferRescinded(nil, util.NewOfferID("test2"))
+	//walk the offers again and check that they have been removed from the registry
+	assert.Equal(0, getNumberOffers(testScheduler.offers))
+
+	//remove a non-existing ID
+	testScheduler.OfferRescinded(nil, util.NewOfferID("notExist"))
+}
+
+//test that when a slave is lost we remove all of its offers
+func TestSlave_Lost(t *testing.T) {
+	assert := assert.New(t)
+
+	testScheduler := &KubernetesScheduler{
+		offers: offers.CreateRegistry(offers.RegistryConfig{
+			Compat: func(o *mesos.Offer) bool {
+				return true
+			},
+			// remember expired offers so that we can tell if a previously scheduled pod relies on one
+			LingerTTL:     schedcfg.DefaultOfferLingerTTL,
+			TTL:           schedcfg.DefaultOfferTTL,
+			ListenerDelay: schedcfg.DefaultListenerDelay,
+		}),
+		slaves: newSlaveStorage(),
+	}
+
+	hostname := "h1"
+	offer1 := &mesos.Offer{Id: util.NewOfferID("test1"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
+	offers1 := []*mesos.Offer{offer1}
+	testScheduler.ResourceOffers(nil, offers1)
+	offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
+	offers2 := []*mesos.Offer{offer2}
+	testScheduler.ResourceOffers(nil, offers2)
+
+	//add another offer from a different slaveID
+	hostname2 := "h2"
+	offer3 := &mesos.Offer{Id: util.NewOfferID("test3"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
+	offers3 := []*mesos.Offer{offer3}
+	testScheduler.ResourceOffers(nil, offers3)
+
+	//test precondition
+	assert.Equal(3, getNumberOffers(testScheduler.offers))
+	assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))
+
+	//remove the first slave
+	testScheduler.SlaveLost(nil, util.NewSlaveID(hostname))
+
+	//its offers should be removed
+	assert.Equal(1, getNumberOffers(testScheduler.offers))
+	//slave hostnames should all still be present
+	assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))
+
+	//remove the second slave
+	testScheduler.SlaveLost(nil, util.NewSlaveID(hostname2))
+
+	//its offers should be removed
+	assert.Equal(0, getNumberOffers(testScheduler.offers))
+	//slave hostnames should all still be present
+	assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))
+
+	//try to remove a non-existing slave
+	testScheduler.SlaveLost(nil, util.NewSlaveID("notExist"))
+}
+
+//test that when we lose the connection to the master we invalidate all cached offers
+func TestDisconnect(t *testing.T) {
+	assert := assert.New(t)
+
+	testScheduler := &KubernetesScheduler{
+		offers: offers.CreateRegistry(offers.RegistryConfig{
+			Compat: func(o *mesos.Offer) bool {
+				return true
+			},
+			// remember expired offers so that we can tell if a previously scheduled pod relies on one
+			LingerTTL:     schedcfg.DefaultOfferLingerTTL,
+			TTL:           schedcfg.DefaultOfferTTL,
+			ListenerDelay: schedcfg.DefaultListenerDelay,
+		}),
+		slaves: newSlaveStorage(),
+	}
+
+	hostname := "h1"
+	offer1 := &mesos.Offer{Id: util.NewOfferID("test1"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
+	offers1 := []*mesos.Offer{offer1}
+	testScheduler.ResourceOffers(nil, offers1)
+	offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
+	offers2 := []*mesos.Offer{offer2}
+	testScheduler.ResourceOffers(nil, offers2)
+
+	//add another offer from a different slaveID
+	hostname2 := "h2"
+	offer3 := &mesos.Offer{Id: util.NewOfferID("test3"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
+	offers3 := []*mesos.Offer{offer3}
+	testScheduler.ResourceOffers(nil, offers3)
+
+	//disconnect
+	testScheduler.Disconnected(nil)
+
+	//all offers should be removed
+	assert.Equal(0, getNumberOffers(testScheduler.offers))
+	//slave hostnames should all still be present
+	assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))
+}
+
+//test that we can handle different status updates, TODO: check state transitions
+func TestStatus_Update(t *testing.T) {
+
+	mockdriver := MockSchedulerDriver{}
+	// setup expectations
+	mockdriver.On("KillTask", util.NewTaskID("test-task-001")).Return(mesos.Status_DRIVER_RUNNING, nil)
+
+	testScheduler := &KubernetesScheduler{
+		offers: offers.CreateRegistry(offers.RegistryConfig{
+			Compat: func(o *mesos.Offer) bool {
+				return true
+			},
+			// remember expired offers so that we can tell if a previously scheduled pod relies on one
+			LingerTTL:     schedcfg.DefaultOfferLingerTTL,
+			TTL:           schedcfg.DefaultOfferTTL,
+			ListenerDelay: schedcfg.DefaultListenerDelay,
+		}),
+		slaves:       newSlaveStorage(),
+		driver:       &mockdriver,
+		taskRegistry: podtask.NewInMemoryRegistry(),
+	}
+
+	taskStatus_task_starting := util.NewTaskStatus(
+		util.NewTaskID("test-task-001"),
+		mesos.TaskState_TASK_RUNNING,
+	)
+	testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_starting)
+
+	taskStatus_task_running := util.NewTaskStatus(
+		util.NewTaskID("test-task-001"),
+		mesos.TaskState_TASK_RUNNING,
+	)
+	testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_running)
+
+	taskStatus_task_failed := util.NewTaskStatus(
+		util.NewTaskID("test-task-001"),
+		mesos.TaskState_TASK_FAILED,
+	)
+	testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_failed)
+
+	//assert that the mock was invoked
+	mockdriver.AssertExpectations(t)
+}
diff --git a/contrib/mesos/pkg/scheduler/service/compat_testing.go b/contrib/mesos/pkg/scheduler/service/compat_testing.go
new file mode 100644
index 00000000000..8e7ba2bb033
--- /dev/null
+++ b/contrib/mesos/pkg/scheduler/service/compat_testing.go
@@ -0,0 +1,32 @@
+// +build unit_test
+
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package service + +import ( + "os" + "syscall" +) + +func makeFailoverSigChan() <-chan os.Signal { + return nil +} + +func makeDisownedProcAttr() *syscall.SysProcAttr { + return nil +} diff --git a/contrib/mesos/pkg/scheduler/service/compat_unix.go b/contrib/mesos/pkg/scheduler/service/compat_unix.go new file mode 100644 index 00000000000..90d3bdeff18 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/service/compat_unix.go @@ -0,0 +1,38 @@ +// +build darwin dragonfly freebsd linux netbsd openbsd +// +build !unit_test + +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package service + +import ( + "os" + "os/signal" + "syscall" +) + +func makeFailoverSigChan() <-chan os.Signal { + ch := make(chan os.Signal, 1) + signal.Notify(ch, syscall.SIGUSR1) + return ch +} + +func makeDisownedProcAttr() *syscall.SysProcAttr { + return &syscall.SysProcAttr{ + Setpgid: true, // disown the spawned scheduler + } +} diff --git a/contrib/mesos/pkg/scheduler/service/compat_windows.go b/contrib/mesos/pkg/scheduler/service/compat_windows.go new file mode 100644 index 00000000000..5ce9a5d7edd --- /dev/null +++ b/contrib/mesos/pkg/scheduler/service/compat_windows.go @@ -0,0 +1,51 @@ +// +build windows +// +build !unit_test + +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package service + +import ( + "os" + "syscall" +) + +func makeFailoverSigChan() <-chan os.Signal { + /* TODO(jdef) + from go's windows compatibility test, it looks like we need to provide a filtered + signal channel here + + c := make(chan os.Signal, 10) + signal.Notify(c) + select { + case s := <-c: + if s != os.Interrupt { + log.Fatalf("Wrong signal received: got %q, want %q\n", s, os.Interrupt) + } + case <-time.After(3 * time.Second): + log.Fatalf("Timeout waiting for Ctrl+Break\n") + } + */ + return nil +} + +func makeDisownedProcAttr() *syscall.SysProcAttr { + //TODO(jdef) test this somehow?!?! + return &syscall.SysProcAttr{ + CreationFlags: syscall.CREATE_NEW_PROCESS_GROUP | syscall.CREATE_UNICODE_ENVIRONMENT, + } +} diff --git a/contrib/mesos/pkg/scheduler/service/doc.go b/contrib/mesos/pkg/scheduler/service/doc.go new file mode 100644 index 00000000000..61ffbcecfff --- /dev/null +++ b/contrib/mesos/pkg/scheduler/service/doc.go @@ -0,0 +1,18 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package service contains the cmd/k8sm-scheduler glue code +package service diff --git a/contrib/mesos/pkg/scheduler/service/publish.go b/contrib/mesos/pkg/scheduler/service/publish.go new file mode 100644 index 00000000000..2a356e5af3e --- /dev/null +++ b/contrib/mesos/pkg/scheduler/service/publish.go @@ -0,0 +1,121 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package service + +import ( + "net" + "reflect" + "time" + + "github.com/GoogleCloudPlatform/kubernetes/pkg/api" + "github.com/GoogleCloudPlatform/kubernetes/pkg/api/errors" + "github.com/GoogleCloudPlatform/kubernetes/pkg/master/ports" + + "github.com/golang/glog" +) + +const ( + SCHEDULER_SERVICE_NAME = "k8sm-scheduler" +) + +func (m *SchedulerServer) newServiceWriter(stop <-chan struct{}) func() { + return func() { + for { + // Update service & endpoint records. + // TODO(k8s): when it becomes possible to change this stuff, + // stop polling and start watching. + if err := m.createSchedulerServiceIfNeeded(SCHEDULER_SERVICE_NAME, ports.SchedulerPort); err != nil { + glog.Errorf("Can't create scheduler service: %v", err) + } + + if err := m.setEndpoints(SCHEDULER_SERVICE_NAME, net.IP(m.Address), m.Port); err != nil { + glog.Errorf("Can't create scheduler endpoints: %v", err) + } + + select { + case <-stop: + return + case <-time.After(10 * time.Second): + } + } + } +} + +// createSchedulerServiceIfNeeded will create the specified service if it +// doesn't already exist. +func (m *SchedulerServer) createSchedulerServiceIfNeeded(serviceName string, servicePort int) error { + ctx := api.NewDefaultContext() + if _, err := m.client.Services(api.NamespaceValue(ctx)).Get(serviceName); err == nil { + // The service already exists. + return nil + } + svc := &api.Service{ + ObjectMeta: api.ObjectMeta{ + Name: serviceName, + Namespace: api.NamespaceDefault, + Labels: map[string]string{"provider": "k8sm", "component": "scheduler"}, + }, + Spec: api.ServiceSpec{ + Ports: []api.ServicePort{{Port: servicePort, Protocol: api.ProtocolTCP}}, + // maintained by this code, not by the pod selector + Selector: nil, + SessionAffinity: api.ServiceAffinityNone, + }, + } + if m.ServiceAddress != nil { + svc.Spec.ClusterIP = m.ServiceAddress.String() + } + _, err := m.client.Services(api.NamespaceValue(ctx)).Create(svc) + if err != nil && errors.IsAlreadyExists(err) { + err = nil + } + return err +} + +// setEndpoints sets the endpoints for the given service. +// in a multi-master scenario only the master will be publishing an endpoint. +// see SchedulerServer.bootstrap. 
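+//
+// The desired record is a single subset covering this scheduler, e.g. (sketch;
+// the IP shown is hypothetical):
+//
+//	[]api.EndpointSubset{{
+//		Addresses: []api.EndpointAddress{{IP: "10.2.3.4"}},
+//		Ports:     []api.EndpointPort{{Port: port, Protocol: api.ProtocolTCP}},
+//	}}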
+func (m *SchedulerServer) setEndpoints(serviceName string, ip net.IP, port int) error { + // The setting we want to find. + want := []api.EndpointSubset{{ + Addresses: []api.EndpointAddress{{IP: ip.String()}}, + Ports: []api.EndpointPort{{Port: port, Protocol: api.ProtocolTCP}}, + }} + + ctx := api.NewDefaultContext() + e, err := m.client.Endpoints(api.NamespaceValue(ctx)).Get(serviceName) + createOrUpdate := m.client.Endpoints(api.NamespaceValue(ctx)).Update + if err != nil { + if errors.IsNotFound(err) { + createOrUpdate = m.client.Endpoints(api.NamespaceValue(ctx)).Create + } + e = &api.Endpoints{ + ObjectMeta: api.ObjectMeta{ + Name: serviceName, + Namespace: api.NamespaceDefault, + }, + } + } + if !reflect.DeepEqual(e.Subsets, want) { + e.Subsets = want + glog.Infof("setting endpoints for master service %q to %#v", serviceName, e) + _, err = createOrUpdate(e) + return err + } + // We didn't make any changes, no need to actually call update. + return nil +} diff --git a/contrib/mesos/pkg/scheduler/service/service.go b/contrib/mesos/pkg/scheduler/service/service.go new file mode 100644 index 00000000000..8acc7517be8 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/service/service.go @@ -0,0 +1,751 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package service + +import ( + "bufio" + "errors" + "fmt" + "io/ioutil" + "net" + "net/http" + "os" + "os/exec" + "os/user" + "strconv" + "strings" + "sync" + "time" + + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/election" + execcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/config" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/hyperkube" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/profile" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler" + schedcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/config" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/ha" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/metrics" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/uid" + "github.com/GoogleCloudPlatform/kubernetes/pkg/client" + "github.com/GoogleCloudPlatform/kubernetes/pkg/clientauth" + "github.com/GoogleCloudPlatform/kubernetes/pkg/master/ports" + "github.com/GoogleCloudPlatform/kubernetes/pkg/tools" + "github.com/GoogleCloudPlatform/kubernetes/pkg/util" + "github.com/coreos/go-etcd/etcd" + "github.com/gogo/protobuf/proto" + log "github.com/golang/glog" + "github.com/kardianos/osext" + "github.com/mesos/mesos-go/auth" + "github.com/mesos/mesos-go/auth/sasl" + "github.com/mesos/mesos-go/auth/sasl/mech" + mesos "github.com/mesos/mesos-go/mesosproto" + mutil "github.com/mesos/mesos-go/mesosutil" + bindings "github.com/mesos/mesos-go/scheduler" + "github.com/prometheus/client_golang/prometheus" + "github.com/spf13/pflag" + "golang.org/x/net/context" +) + +const ( + defaultMesosMaster = "localhost:5050" + defaultMesosUser = "root" // should have privs to execute docker and iptables commands + defaultReconcileInterval = 300 // 5m default task reconciliation interval + defaultReconcileCooldown = 15 * time.Second + defaultFrameworkName = "Kubernetes" +) + +type SchedulerServer struct { + Port int + Address util.IP + EnableProfiling bool + AuthPath string + APIServerList util.StringList + EtcdServerList util.StringList + EtcdConfigFile string + AllowPrivileged bool + ExecutorPath string + ProxyPath string + MesosMaster string + MesosUser string + MesosRole string + MesosAuthPrincipal string + MesosAuthSecretFile string + Checkpoint bool + FailoverTimeout float64 + ExecutorBindall bool + ExecutorRunProxy bool + ExecutorProxyBindall bool + ExecutorLogV int + ExecutorSuicideTimeout time.Duration + MesosAuthProvider string + DriverPort uint + HostnameOverride string + ReconcileInterval int64 + ReconcileCooldown time.Duration + SchedulerConfigFileName string + Graceful bool + FrameworkName string + FrameworkWebURI string + HA bool + AdvertisedAddress string + ServiceAddress util.IP + HADomain string + KMPath string + ClusterDNS util.IP + ClusterDomain string + KubeletRootDirectory string + KubeletDockerEndpoint string + KubeletPodInfraContainerImage string + KubeletCadvisorPort uint + KubeletHostNetworkSources string + KubeletSyncFrequency time.Duration + KubeletNetworkPluginName string + + executable string // path to the binary running this service + client *client.Client + driver bindings.SchedulerDriver + driverMutex sync.RWMutex + mux *http.ServeMux +} + +// useful for unit testing specific funcs +type schedulerProcessInterface interface { + End() 
<-chan struct{}
+	Failover() <-chan struct{}
+	Terminal() <-chan struct{}
+}
+
+// NewSchedulerServer creates a new SchedulerServer with default parameters
+func NewSchedulerServer() *SchedulerServer {
+	s := SchedulerServer{
+		Port:                   ports.SchedulerPort,
+		Address:                util.IP(net.ParseIP("127.0.0.1")),
+		FailoverTimeout:        time.Duration((1 << 62) - 1).Seconds(),
+		ExecutorRunProxy:       true,
+		ExecutorSuicideTimeout: execcfg.DefaultSuicideTimeout,
+		MesosAuthProvider:      sasl.ProviderName,
+		MesosMaster:            defaultMesosMaster,
+		MesosUser:              defaultMesosUser,
+		ReconcileInterval:      defaultReconcileInterval,
+		ReconcileCooldown:      defaultReconcileCooldown,
+		Checkpoint:             true,
+		FrameworkName:          defaultFrameworkName,
+		HA:                     false,
+		mux:                    http.NewServeMux(),
+		KubeletCadvisorPort:    4194, // copied from github.com/GoogleCloudPlatform/kubernetes/blob/release-0.14/cmd/kubelet/app/server.go
+		KubeletSyncFrequency:   10 * time.Second,
+	}
+	// cache this for later use. also useful in case the original binary gets deleted, e.g.
+	// during upgrades, development deployments, etc.
+	if filename, err := osext.Executable(); err != nil {
+		log.Fatalf("failed to determine path to currently running executable: %v", err)
+	} else {
+		s.executable = filename
+		s.KMPath = filename
+	}
+
+	return &s
+}
+
+func (s *SchedulerServer) addCoreFlags(fs *pflag.FlagSet) {
+	fs.IntVar(&s.Port, "port", s.Port, "The port that the scheduler's http service runs on")
+	fs.Var(&s.Address, "address", "The IP address to serve on (set to 0.0.0.0 for all interfaces)")
+	fs.BoolVar(&s.EnableProfiling, "profiling", s.EnableProfiling, "Enable profiling via web interface host:port/debug/pprof/")
+	fs.Var(&s.APIServerList, "api-servers", "List of Kubernetes API servers for publishing events, and reading pods and services. (ip:port), comma separated.")
+	fs.StringVar(&s.AuthPath, "auth-path", s.AuthPath, "Path to .kubernetes_auth file, specifying how to authenticate to API server.")
+	fs.Var(&s.EtcdServerList, "etcd-servers", "List of etcd servers to watch (http://ip:port), comma separated. Mutually exclusive with --etcd-config")
+	fs.StringVar(&s.EtcdConfigFile, "etcd-config", s.EtcdConfigFile, "The config file for the etcd client. Mutually exclusive with --etcd-servers.")
+	fs.BoolVar(&s.AllowPrivileged, "allow-privileged", s.AllowPrivileged, "If true, allow privileged containers.")
+	fs.StringVar(&s.ClusterDomain, "cluster-domain", s.ClusterDomain, "Domain for this cluster. If set, kubelet will configure all containers to search this domain in addition to the host's search domains")
+	fs.Var(&s.ClusterDNS, "cluster-dns", "IP address for a cluster DNS server. If set, kubelet will configure all containers to use this for DNS resolution in addition to the host's DNS servers")
+
+	fs.StringVar(&s.MesosMaster, "mesos-master", s.MesosMaster, "Location of the Mesos master. The format is a comma-delimited list of hosts like zk://host1:port,host2:port/mesos. If using ZooKeeper, pay particular attention to the leading zk:// and trailing /mesos! If not using ZooKeeper, standard URLs like http://localhost are also acceptable.")
+	fs.StringVar(&s.MesosUser, "mesos-user", s.MesosUser, "Mesos user for this framework, defaults to root.")
+	fs.StringVar(&s.MesosRole, "mesos-role", s.MesosRole, "Mesos role for this framework, defaults to none.")
+	fs.StringVar(&s.MesosAuthPrincipal, "mesos-authentication-principal", s.MesosAuthPrincipal, "Mesos authentication principal.")
+	fs.StringVar(&s.MesosAuthSecretFile, "mesos-authentication-secret-file", s.MesosAuthSecretFile, "Mesos authentication secret file.")
+	fs.StringVar(&s.MesosAuthProvider, "mesos-authentication-provider", s.MesosAuthProvider, fmt.Sprintf("Authentication provider to use, default is SASL that supports mechanisms: %+v", mech.ListSupported()))
+	fs.BoolVar(&s.Checkpoint, "checkpoint", s.Checkpoint, "Enable/disable checkpointing for the kubernetes-mesos framework.")
+	fs.Float64Var(&s.FailoverTimeout, "failover-timeout", s.FailoverTimeout, "Framework failover timeout, in sec.")
+	fs.UintVar(&s.DriverPort, "driver-port", s.DriverPort, "Port that the Mesos scheduler driver process should listen on.")
+	fs.StringVar(&s.HostnameOverride, "hostname-override", s.HostnameOverride, "If non-empty, will use this string as identification instead of the actual hostname.")
+	fs.Int64Var(&s.ReconcileInterval, "reconcile-interval", s.ReconcileInterval, "Interval at which to execute task reconciliation, in sec. Zero disables.")
+	fs.DurationVar(&s.ReconcileCooldown, "reconcile-cooldown", s.ReconcileCooldown, "Minimum rest period between task reconciliation operations.")
+	fs.StringVar(&s.SchedulerConfigFileName, "scheduler-config", s.SchedulerConfigFileName, "An ini-style configuration file with low-level scheduler settings.")
+	fs.BoolVar(&s.Graceful, "graceful", s.Graceful, "Indicator of a graceful failover, intended for internal use only.")
+	fs.BoolVar(&s.HA, "ha", s.HA, "Run the scheduler in high availability mode with leader election. All peers should be configured exactly the same.")
+	fs.StringVar(&s.FrameworkName, "framework-name", s.FrameworkName, "The framework name to register with Mesos.")
+	fs.StringVar(&s.FrameworkWebURI, "framework-weburi", s.FrameworkWebURI, "A URI that points to a web-based interface for interacting with the framework.")
+	fs.StringVar(&s.AdvertisedAddress, "advertised-address", s.AdvertisedAddress, "host:port address that is advertised to clients. May be used to construct artifact download URIs.")
+	fs.Var(&s.ServiceAddress, "service-address", "The service portal IP address that the scheduler should register with (if unset, chooses randomly)")
+
+	fs.BoolVar(&s.ExecutorBindall, "executor-bindall", s.ExecutorBindall, "When true will set -address of the executor to 0.0.0.0.")
+	fs.IntVar(&s.ExecutorLogV, "executor-logv", s.ExecutorLogV, "Logging verbosity of spawned executor processes.")
+	fs.BoolVar(&s.ExecutorProxyBindall, "executor-proxy-bindall", s.ExecutorProxyBindall, "When true pass -proxy-bindall to the executor.")
+	fs.BoolVar(&s.ExecutorRunProxy, "executor-run-proxy", s.ExecutorRunProxy, "Run the kube-proxy as a child process of the executor.")
+	fs.DurationVar(&s.ExecutorSuicideTimeout, "executor-suicide-timeout", s.ExecutorSuicideTimeout, "Executor self-terminates after this period of inactivity. Zero disables suicide watch.")
+
+	fs.StringVar(&s.KubeletRootDirectory, "kubelet-root-dir", s.KubeletRootDirectory, "Directory path for managing kubelet files (volume mounts, etc). Defaults to executor sandbox.")
+	fs.StringVar(&s.KubeletDockerEndpoint, "kubelet-docker-endpoint", s.KubeletDockerEndpoint, "If non-empty, kubelet will use this for the docker endpoint to communicate with.")
+	fs.StringVar(&s.KubeletPodInfraContainerImage, "kubelet-pod-infra-container-image", s.KubeletPodInfraContainerImage, "The image whose network/ipc namespaces containers in each pod will use.")
+	fs.UintVar(&s.KubeletCadvisorPort, "kubelet-cadvisor-port", s.KubeletCadvisorPort, "The port of the kubelet's local cAdvisor endpoint")
+	fs.StringVar(&s.KubeletHostNetworkSources, "kubelet-host-network-sources", s.KubeletHostNetworkSources, "Comma-separated list of sources from which the Kubelet allows pods to use the host network. For all sources use \"*\" [default=\"file\"]")
+	fs.DurationVar(&s.KubeletSyncFrequency, "kubelet-sync-frequency", s.KubeletSyncFrequency, "Max period between synchronizing running containers and config")
+	fs.StringVar(&s.KubeletNetworkPluginName, "kubelet-network-plugin", s.KubeletNetworkPluginName, "The name of the network plugin to be invoked for various events in kubelet/pod lifecycle")
+
+	//TODO(jdef) support this flag once we have a better handle on mesos-dns and k8s DNS integration
+	//fs.StringVar(&s.HADomain, "ha-domain", s.HADomain, "Domain of the HA scheduler service, only used in HA mode. If specified may be used to construct artifact download URIs.")
+}
+
+func (s *SchedulerServer) AddStandaloneFlags(fs *pflag.FlagSet) {
+	s.addCoreFlags(fs)
+	fs.StringVar(&s.ExecutorPath, "executor-path", s.ExecutorPath, "Location of the kubernetes executor executable")
+	fs.StringVar(&s.ProxyPath, "proxy-path", s.ProxyPath, "Location of the kubernetes proxy executable")
+}
+
+func (s *SchedulerServer) AddHyperkubeFlags(fs *pflag.FlagSet) {
+	s.addCoreFlags(fs)
+	fs.StringVar(&s.KMPath, "km-path", s.KMPath, "Location of the km executable, may be a URI or an absolute file path.")
+}
+
+// returns (downloadURI, basename(path))
+func (s *SchedulerServer) serveFrameworkArtifact(path string) (string, string) {
+	serveFile := func(pattern string, filename string) {
+		s.mux.HandleFunc(pattern, func(w http.ResponseWriter, r *http.Request) {
+			http.ServeFile(w, r, filename)
+		})
+	}
+
+	// Create base path (http://foobar:5000/)
+	pathSplit := strings.Split(path, "/")
+	var base string
+	if len(pathSplit) > 0 {
+		base = pathSplit[len(pathSplit)-1]
+	} else {
+		base = path
+	}
+	serveFile("/"+base, path)
+
+	hostURI := ""
+	if s.AdvertisedAddress != "" {
+		hostURI = fmt.Sprintf("http://%s/%s", s.AdvertisedAddress, base)
+	} else if s.HA && s.HADomain != "" {
+		hostURI = fmt.Sprintf("http://%s.%s:%d/%s", SCHEDULER_SERVICE_NAME, s.HADomain, ports.SchedulerPort, base)
+	} else {
+		hostURI = fmt.Sprintf("http://%s:%d/%s", s.Address.String(), s.Port, base)
+	}
+	log.V(2).Infof("Hosting artifact '%s' at '%s'", path, hostURI)
+
+	return hostURI, base
+}
+
+func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.ExecutorInfo, *uid.UID, error) {
+	ci := &mesos.CommandInfo{
+		Shell: proto.Bool(false),
+	}
+
+	//TODO(jdef) these should be shared constants with km
+	const (
+		KM_EXECUTOR = "executor"
+		KM_PROXY    = "proxy"
+	)
+
+	if s.ExecutorPath != "" {
+		uri, executorCmd := s.serveFrameworkArtifact(s.ExecutorPath)
+		ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
+		ci.Value = proto.String(fmt.Sprintf("./%s", executorCmd))
+	} else if !hks.FindServer(KM_EXECUTOR) {
+		return nil, nil, fmt.Errorf("either run this scheduler via km or else --executor-path is required")
+	} else {
+		if strings.Index(s.KMPath, "://") > 0 {
+			// URI could point directly to executable, e.g. hdfs:///km
+			// or else indirectly, e.g. http://acmestorage/tarball.tgz
+			// so we assume that for this case the command will always be "km"
+			ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(s.KMPath), Executable: proto.Bool(true)})
+			ci.Value = proto.String("./km") // TODO(jdef) extract constant
+		} else if s.KMPath != "" {
+			uri, kmCmd := s.serveFrameworkArtifact(s.KMPath)
+			ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
+			ci.Value = proto.String(fmt.Sprintf("./%s", kmCmd))
+		} else {
+			uri, kmCmd := s.serveFrameworkArtifact(s.executable)
+			ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
+			ci.Value = proto.String(fmt.Sprintf("./%s", kmCmd))
+		}
+		ci.Arguments = append(ci.Arguments, KM_EXECUTOR)
+	}
+
+	if s.ProxyPath != "" {
+		uri, proxyCmd := s.serveFrameworkArtifact(s.ProxyPath)
+		ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
+		ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-exec=./%s", proxyCmd))
+	} else if !hks.FindServer(KM_PROXY) {
+		return nil, nil, fmt.Errorf("either run this scheduler via km or else --proxy-path is required")
+	} else if s.ExecutorPath != "" {
+		return nil, nil, fmt.Errorf("proxy can only use km binary if executor does the same")
+	} // else, executor is smart enough to know when proxy-path is required, or to use km
+
+	//TODO(jdef): provide some way (env var?) for users to customize executor config
+	//TODO(jdef): set -address to 127.0.0.1 if `address` is 127.0.0.1
+	//TODO(jdef): propagate dockercfg from RootDirectory?
+
+	apiServerArgs := strings.Join(s.APIServerList, ",")
+	ci.Arguments = append(ci.Arguments, fmt.Sprintf("--api-servers=%s", apiServerArgs))
+	ci.Arguments = append(ci.Arguments, fmt.Sprintf("--v=%d", s.ExecutorLogV))
+	ci.Arguments = append(ci.Arguments, fmt.Sprintf("--allow-privileged=%t", s.AllowPrivileged))
+	ci.Arguments = append(ci.Arguments, fmt.Sprintf("--suicide-timeout=%v", s.ExecutorSuicideTimeout))
+
+	if s.ExecutorBindall {
+		//TODO(jdef) determine whether hostname-override is really needed for bindall because
+		//it conflicts with kubelet node status checks/updates
+		//ci.Arguments = append(ci.Arguments, "--hostname-override=0.0.0.0")
+		ci.Arguments = append(ci.Arguments, "--address=0.0.0.0")
+	}
+
+	ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-bindall=%v", s.ExecutorProxyBindall))
+	ci.Arguments = append(ci.Arguments, fmt.Sprintf("--run-proxy=%v", s.ExecutorRunProxy))
+	ci.Arguments = append(ci.Arguments, fmt.Sprintf("--cadvisor-port=%v", s.KubeletCadvisorPort))
+	ci.Arguments = append(ci.Arguments, fmt.Sprintf("--sync-frequency=%v", s.KubeletSyncFrequency))
+
+	if s.AuthPath != "" {
+		//TODO(jdef) should probably support non-local files, e.g.
hdfs:///some/config/file + uri, basename := s.serveFrameworkArtifact(s.AuthPath) + ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri)}) + ci.Arguments = append(ci.Arguments, fmt.Sprintf("--auth-path=%s", basename)) + } + appendOptional := func(name string, value string) { + if value != "" { + ci.Arguments = append(ci.Arguments, fmt.Sprintf("--%s=%s", name, value)) + } + } + if s.ClusterDNS != nil { + appendOptional("cluster-dns", s.ClusterDNS.String()) + } + appendOptional("cluster-domain", s.ClusterDomain) + appendOptional("root-dir", s.KubeletRootDirectory) + appendOptional("docker-endpoint", s.KubeletDockerEndpoint) + appendOptional("pod-infra-container-image", s.KubeletPodInfraContainerImage) + appendOptional("host-network-sources", s.KubeletHostNetworkSources) + appendOptional("network-plugin", s.KubeletNetworkPluginName) + + log.V(1).Infof("prepared executor command %q with args '%+v'", ci.GetValue(), ci.Arguments) + + // Create the mesos ExecutorInfo. + info := &mesos.ExecutorInfo{ + Command: ci, + Name: proto.String(execcfg.DefaultInfoName), + Source: proto.String(execcfg.DefaultInfoSource), + } + + // calculate ExecutorInfo hash to be used for validating compatibility + // of ExecutorInfos generated by other HA schedulers. + ehash := hashExecutorInfo(info) + eid := uid.New(ehash, execcfg.DefaultInfoID) + info.ExecutorId = &mesos.ExecutorID{Value: proto.String(eid.String())} + + return info, eid, nil +} + +// TODO(jdef): hacked from kubelet/server/server.go +// TODO(k8s): replace this with clientcmd +func (s *SchedulerServer) createAPIServerClient() (*client.Client, error) { + authInfo, err := clientauth.LoadFromFile(s.AuthPath) + if err != nil { + log.Warningf("Could not load kubernetes auth path: %v. Continuing with defaults.", err) + } + if authInfo == nil { + // authInfo didn't load correctly - continue with defaults. + authInfo = &clientauth.Info{} + } + clientConfig, err := authInfo.MergeWithConfig(client.Config{}) + if err != nil { + return nil, err + } + if len(s.APIServerList) < 1 { + return nil, fmt.Errorf("no api servers specified") + } + // TODO: adapt Kube client to support LB over several servers + if len(s.APIServerList) > 1 { + log.Infof("Multiple api servers specified.
Picking first one") + } + clientConfig.Host = s.APIServerList[0] + c, err := client.New(&clientConfig) + if err != nil { + return nil, err + } + return c, nil +} + +func (s *SchedulerServer) setDriver(driver bindings.SchedulerDriver) { + s.driverMutex.Lock() + defer s.driverMutex.Unlock() + s.driver = driver +} + +func (s *SchedulerServer) getDriver() (driver bindings.SchedulerDriver) { + s.driverMutex.RLock() + defer s.driverMutex.RUnlock() + return s.driver +} + +func (s *SchedulerServer) Run(hks hyperkube.Interface, _ []string) error { + // get scheduler low-level config + sc := schedcfg.CreateDefaultConfig() + if s.SchedulerConfigFileName != "" { + f, err := os.Open(s.SchedulerConfigFileName) + if err != nil { + log.Fatalf("Cannot open scheduler config file: %v", err) + } + + err = sc.Read(bufio.NewReader(f)) + if err != nil { + log.Fatalf("Invalid scheduler config file: %v", err) + } + } + + schedulerProcess, driverFactory, etcdClient, eid := s.bootstrap(hks, sc) + + if s.EnableProfiling { + profile.InstallHandler(s.mux) + } + go runtime.Until(func() { + log.V(1).Info("Starting HTTP interface") + log.Error(http.ListenAndServe(net.JoinHostPort(s.Address.String(), strconv.Itoa(s.Port)), s.mux)) + }, sc.HttpBindInterval.Duration, schedulerProcess.Terminal()) + + if s.HA { + validation := ha.ValidationFunc(validateLeadershipTransition) + srv := ha.NewCandidate(schedulerProcess, driverFactory, validation) + path := fmt.Sprintf(meta.DefaultElectionFormat, s.FrameworkName) + sid := uid.New(eid.Group(), "").String() + log.Infof("registering for election at %v with id %v", path, sid) + go election.Notify(election.NewEtcdMasterElector(etcdClient), path, sid, srv, nil) + } else { + log.Infoln("self-electing in non-HA mode") + schedulerProcess.Elect(driverFactory) + } + return s.awaitFailover(schedulerProcess, func() error { return s.failover(s.getDriver(), hks) }) +} + +// watch the scheduler process for failover signals and handle them appropriately. may never return. +func (s *SchedulerServer) awaitFailover(schedulerProcess schedulerProcessInterface, handler func() error) error { + + // we only want to return the first error (if any), everyone else can block forever + errCh := make(chan error, 1) + doFailover := func() error { + // we really don't expect handler to return; if it does, something went seriously wrong + err := handler() + if err != nil { + defer schedulerProcess.End() + err = fmt.Errorf("failover failed, scheduler will terminate: %v", err) + } + return err + } + + // guard for failover signal processing, first signal processor wins + failoverLatch := &runtime.Latch{} + runtime.On(schedulerProcess.Terminal(), func() { + if !failoverLatch.Acquire() { + log.V(1).Infof("scheduler process ending, already failing over") + select {} + } + var err error + defer func() { errCh <- err }() + select { + case <-schedulerProcess.Failover(): + err = doFailover() + default: + if s.HA { + err = fmt.Errorf("ha scheduler exiting instead of failing over") + } else { + log.Infof("exiting scheduler") + } + } + }) + runtime.OnOSSignal(makeFailoverSigChan(), func(_ os.Signal) { + if !failoverLatch.Acquire() { + log.V(1).Infof("scheduler process signalled, already failing over") + select {} + } + errCh <- doFailover() + }) + return <-errCh +} + +func validateLeadershipTransition(desired, current string) { + log.Infof("validating leadership transition") + d := uid.Parse(desired).Group() + c := uid.Parse(current).Group() + if d == 0 { + // should *never* happen, but..
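+ // note that uid.Parse returns nil for a malformed ID, and a nil *UID reports
+ // Group() == 0, so this branch also catches IDs that failed to parse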
+ log.Fatalf("illegal scheduler UID: %q", desired) + } + if d != c && c != 0 { + log.Fatalf("desired scheduler group (%x) != current scheduler group (%x)", d, c) + } +} + +// hacked from https://github.com/GoogleCloudPlatform/kubernetes/blob/release-0.14/cmd/kube-apiserver/app/server.go +func newEtcd(etcdConfigFile string, etcdServerList util.StringList) (client tools.EtcdGetSet, err error) { + if etcdConfigFile != "" { + client, err = etcd.NewClientFromFile(etcdConfigFile) + } else { + client = etcd.NewClient(etcdServerList) + } + return +} + +func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config) (*ha.SchedulerProcess, ha.DriverFactory, tools.EtcdGetSet, *uid.UID) { + + s.FrameworkName = strings.TrimSpace(s.FrameworkName) + if s.FrameworkName == "" { + log.Fatalf("framework-name must be a non-empty string") + } + s.FrameworkWebURI = strings.TrimSpace(s.FrameworkWebURI) + + metrics.Register() + runtime.Register() + s.mux.Handle("/metrics", prometheus.Handler()) + + if (s.EtcdConfigFile != "" && len(s.EtcdServerList) != 0) || (s.EtcdConfigFile == "" && len(s.EtcdServerList) == 0) { + log.Fatalf("specify exactly one of --etcd-servers or --etcd-config") + } + + if len(s.APIServerList) < 1 { + log.Fatal("No api servers specified.") + } + + client, err := s.createAPIServerClient() + if err != nil { + log.Fatalf("Unable to make apiserver client: %v", err) + } + s.client = client + + if s.ReconcileCooldown < defaultReconcileCooldown { + s.ReconcileCooldown = defaultReconcileCooldown + log.Warningf("user-specified reconcile cooldown too small, defaulting to %v", s.ReconcileCooldown) + } + + executor, eid, err := s.prepareExecutorInfo(hks) + if err != nil { + log.Fatalf("misconfigured executor: %v", err) + } + + // TODO(jdef): remove the dependency on etcd as soon as + // (1) the generic config store is available for the FrameworkId storage + // (2) the generic master election is provided by the apiserver + // See docs/proposals/high-availability.md + etcdClient, err := newEtcd(s.EtcdConfigFile, s.EtcdServerList) + if err != nil { + log.Fatalf("misconfigured etcd: %v", err) + } + + mesosPodScheduler := scheduler.New(scheduler.Config{ + Schedcfg: *sc, + Executor: executor, + ScheduleFunc: scheduler.FCFSScheduleFunc, + Client: client, + EtcdClient: etcdClient, + FailoverTimeout: s.FailoverTimeout, + ReconcileInterval: s.ReconcileInterval, + ReconcileCooldown: s.ReconcileCooldown, + }) + + masterUri := s.MesosMaster + info, cred, err := s.buildFrameworkInfo() + if err != nil { + log.Fatalf("Misconfigured mesos framework: %v", err) + } + + schedulerProcess := ha.New(mesosPodScheduler) + dconfig := &bindings.DriverConfig{ + Scheduler: schedulerProcess, + Framework: info, + Master: masterUri, + Credential: cred, + BindingAddress: net.IP(s.Address), + BindingPort: uint16(s.DriverPort), + HostnameOverride: s.HostnameOverride, + WithAuthContext: func(ctx context.Context) context.Context { + ctx = auth.WithLoginProvider(ctx, s.MesosAuthProvider) + ctx = sasl.WithBindingAddress(ctx, net.IP(s.Address)) + return ctx + }, + } + + kpl := scheduler.NewPlugin(mesosPodScheduler.NewDefaultPluginConfig(schedulerProcess.Terminal(), s.mux)) + runtime.On(mesosPodScheduler.Registration(), func() { kpl.Run(schedulerProcess.Terminal()) }) + runtime.On(mesosPodScheduler.Registration(), s.newServiceWriter(schedulerProcess.Terminal())) + + driverFactory := ha.DriverFactory(func() (drv bindings.SchedulerDriver, err error) { + log.V(1).Infoln("performing deferred initialization") + if err =
mesosPodScheduler.Init(schedulerProcess.Master(), kpl, s.mux); err != nil { + return nil, fmt.Errorf("failed to initialize pod scheduler: %v", err) + } + log.V(1).Infoln("deferred init complete") + // defer obtaining framework ID to prevent multiple schedulers + // from overwriting each other's framework IDs + dconfig.Framework.Id, err = s.fetchFrameworkID(etcdClient) + if err != nil { + return nil, fmt.Errorf("failed to fetch framework ID from etcd: %v", err) + } + log.V(1).Infoln("constructing mesos scheduler driver") + drv, err = bindings.NewMesosSchedulerDriver(*dconfig) + if err != nil { + return nil, fmt.Errorf("failed to construct scheduler driver: %v", err) + } + log.V(1).Infoln("constructed mesos scheduler driver:", drv) + s.setDriver(drv) + return drv, nil + }) + + return schedulerProcess, driverFactory, etcdClient, eid +} + +func (s *SchedulerServer) failover(driver bindings.SchedulerDriver, hks hyperkube.Interface) error { + if driver != nil { + stat, err := driver.Stop(true) + if stat != mesos.Status_DRIVER_STOPPED { + return fmt.Errorf("failed to stop driver for failover, received unexpected status code: %v", stat) + } else if err != nil { + return err + } + } + + // there's no guarantee that all goroutines are actually programmed intelligently with 'done' + // signals, so we'll need to restart if we want to really stop everything + + // run the same command that we were launched with + //TODO(jdef) assumption here is that the scheduler is the only service running in this process, we should probably validate that somehow + args := []string{} + flags := pflag.CommandLine + if hks != nil { + args = append(args, hks.Name()) + flags = hks.Flags() + } + flags.Visit(func(flag *pflag.Flag) { + if flag.Name != "api-servers" && flag.Name != "etcd-servers" { + args = append(args, fmt.Sprintf("--%s=%s", flag.Name, flag.Value.String())) + } + }) + if !s.Graceful { + args = append(args, "--graceful") + } + if len(s.APIServerList) > 0 { + args = append(args, "--api-servers="+strings.Join(s.APIServerList, ",")) + } + if len(s.EtcdServerList) > 0 { + args = append(args, "--etcd-servers="+strings.Join(s.EtcdServerList, ",")) + } + args = append(args, flags.Args()...) + + log.V(1).Infof("spawning scheduler for graceful failover: %s %+v", s.executable, args) + + cmd := exec.Command(s.executable, args...)
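+ // inherit this process's stdio so the child's logging remains visible, and
+ // detach the child (via makeDisownedProcAttr) so it survives the os.Exit below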
+ cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + cmd.SysProcAttr = makeDisownedProcAttr() + + // TODO(jdef) pass in a pipe FD so that we can block, waiting for the child proc to be ready + //cmd.ExtraFiles = []*os.File{} + + exitcode := 0 + log.Flush() // TODO(jdef) it would be really nice to ensure that no one else in our process was still logging + if err := cmd.Start(); err != nil { + //log to stdout here to avoid conflicts with normal stderr logging + fmt.Fprintf(os.Stdout, "failed to spawn failover process: %v\n", err) + os.Exit(1) + } + os.Exit(exitcode) + select {} // will never reach here +} + +func (s *SchedulerServer) buildFrameworkInfo() (info *mesos.FrameworkInfo, cred *mesos.Credential, err error) { + username, err := s.getUsername() + if err != nil { + return nil, nil, err + } + log.V(2).Infof("Framework configured with mesos user %v", username) + info = &mesos.FrameworkInfo{ + Name: proto.String(s.FrameworkName), + User: proto.String(username), + Checkpoint: proto.Bool(s.Checkpoint), + } + if s.FrameworkWebURI != "" { + info.WebuiUrl = proto.String(s.FrameworkWebURI) + } + if s.FailoverTimeout > 0 { + info.FailoverTimeout = proto.Float64(s.FailoverTimeout) + } + if s.MesosRole != "" { + info.Role = proto.String(s.MesosRole) + } + if s.MesosAuthPrincipal != "" { + info.Principal = proto.String(s.MesosAuthPrincipal) + if s.MesosAuthSecretFile == "" { + return nil, nil, errors.New("authentication principal specified without the required credentials file") + } + secret, err := ioutil.ReadFile(s.MesosAuthSecretFile) + if err != nil { + return nil, nil, err + } + cred = &mesos.Credential{ + Principal: proto.String(s.MesosAuthPrincipal), + Secret: secret, + } + } + return +} + +func (s *SchedulerServer) fetchFrameworkID(client tools.EtcdGetSet) (*mesos.FrameworkID, error) { + if s.FailoverTimeout > 0 { + if response, err := client.Get(meta.FrameworkIDKey, false, false); err != nil { + if !tools.IsEtcdNotFound(err) { + return nil, fmt.Errorf("unexpected failure attempting to load framework ID from etcd: %v", err) + } + log.V(1).Infof("did not find framework ID in etcd") + } else if response.Node.Value != "" { + log.Infof("configuring FrameworkInfo with Id found in etcd: '%s'", response.Node.Value) + return mutil.NewFrameworkID(response.Node.Value), nil + } + } else { + //TODO(jdef) this seems like a totally hackish way to clean up the framework ID + if _, err := client.Delete(meta.FrameworkIDKey, true); err != nil { + if !tools.IsEtcdNotFound(err) { + return nil, fmt.Errorf("failed to delete framework ID from etcd: %v", err) + } + log.V(1).Infof("nothing to delete: did not find framework ID in etcd") + } + } + return nil, nil +} + +func (s *SchedulerServer) getUsername() (username string, err error) { + username = s.MesosUser + if username == "" { + if u, err := user.Current(); err == nil { + username = u.Username + if username == "" { + username = defaultMesosUser + } + } + } + return +} diff --git a/contrib/mesos/pkg/scheduler/service/service_test.go b/contrib/mesos/pkg/scheduler/service/service_test.go new file mode 100644 index 00000000000..5db9c6726c1 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/service/service_test.go @@ -0,0 +1,108 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// +build unit_test + +package service + +import ( + "testing" + "time" +) + +type fakeSchedulerProcess struct { + doneFunc func() <-chan struct{} + failoverFunc func() <-chan struct{} +} + +func (self *fakeSchedulerProcess) Terminal() <-chan struct{} { + if self == nil || self.doneFunc == nil { + return nil + } + return self.doneFunc() +} + +func (self *fakeSchedulerProcess) Failover() <-chan struct{} { + if self == nil || self.failoverFunc == nil { + return nil + } + return self.failoverFunc() +} + +func (self *fakeSchedulerProcess) End() <-chan struct{} { + ch := make(chan struct{}) + close(ch) + return ch +} + +func Test_awaitFailoverDone(t *testing.T) { + done := make(chan struct{}) + p := &fakeSchedulerProcess{ + doneFunc: func() <-chan struct{} { return done }, + } + ss := &SchedulerServer{} + failoverHandlerCalled := false + failoverFailedHandler := func() error { + failoverHandlerCalled = true + return nil + } + errCh := make(chan error, 1) + go func() { + errCh <- ss.awaitFailover(p, failoverFailedHandler) + }() + close(done) + select { + case err := <-errCh: + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + case <-time.After(1 * time.Second): + t.Fatalf("timed out waiting for failover") + } + if failoverHandlerCalled { + t.Fatalf("unexpected call to failover handler") + } +} + +func Test_awaitFailoverDoneFailover(t *testing.T) { + ch := make(chan struct{}) + p := &fakeSchedulerProcess{ + doneFunc: func() <-chan struct{} { return ch }, + failoverFunc: func() <-chan struct{} { return ch }, + } + ss := &SchedulerServer{} + failoverHandlerCalled := false + failoverFailedHandler := func() error { + failoverHandlerCalled = true + return nil + } + errCh := make(chan error, 1) + go func() { + errCh <- ss.awaitFailover(p, failoverFailedHandler) + }() + close(ch) + select { + case err := <-errCh: + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + case <-time.After(1 * time.Second): + t.Fatalf("timed out waiting for failover") + } + if !failoverHandlerCalled { + t.Fatalf("expected call to failover handler") + } +} diff --git a/contrib/mesos/pkg/scheduler/service/util.go b/contrib/mesos/pkg/scheduler/service/util.go new file mode 100644 index 00000000000..33b4a1057f8 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/service/util.go @@ -0,0 +1,88 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package service + +import ( + "bytes" + "fmt" + "hash/crc64" + "sort" + "strconv" + + mesos "github.com/mesos/mesos-go/mesosproto" +) + +// compute a hashcode for ExecutorInfo that may be used as a reasonable litmus test +// with respect to compatibility across HA schedulers. the intent is that an HA scheduler +// should fail-fast if it doesn't pass this test, rather than generating (potentially many) +// errors at run-time because a Mesos master decides that the ExecutorInfo generated by a +// secondary scheduler doesn't match that of the primary scheduler. +// +// see https://github.com/apache/mesos/blob/0.22.0/src/common/type_utils.cpp#L110 +func hashExecutorInfo(info *mesos.ExecutorInfo) uint64 { + // !!! we specifically do NOT include: + // - Framework ID because it's a value that's initialized too late for us to use + // - Executor ID because it's a value that includes a copy of this hash + buf := &bytes.Buffer{} + buf.WriteString(info.GetName()) + buf.WriteString(info.GetSource()) + buf.Write(info.Data) + + if info.Command != nil { + buf.WriteString(info.Command.GetValue()) + buf.WriteString(info.Command.GetUser()) + buf.WriteString(strconv.FormatBool(info.Command.GetShell())) + if sz := len(info.Command.Arguments); sz > 0 { + x := make([]string, sz) + copy(x, info.Command.Arguments) + sort.Strings(x) + for _, item := range x { + buf.WriteString(item) + } + } + if vars := info.Command.Environment.GetVariables(); vars != nil && len(vars) > 0 { + names := []string{} + e := make(map[string]string) + + for _, v := range vars { + if name := v.GetName(); name != "" { + names = append(names, name) + e[name] = v.GetValue() + } + } + sort.Strings(names) + for _, n := range names { + buf.WriteString(n) + buf.WriteString("=") + buf.WriteString(e[n]) + } + } + if uris := info.Command.GetUris(); len(uris) > 0 { + su := []string{} + for _, uri := range uris { + su = append(su, fmt.Sprintf("%s%t%t", uri.GetValue(), uri.GetExecutable(), uri.GetExtract())) + } + sort.Strings(su) + for _, uri := range su { + buf.WriteString(uri) + } + } + //TODO(jdef) add support for Resources and Container + } + table := crc64.MakeTable(crc64.ECMA) + return crc64.Checksum(buf.Bytes(), table) +} diff --git a/contrib/mesos/pkg/scheduler/types.go b/contrib/mesos/pkg/scheduler/types.go new file mode 100644 index 00000000000..a8ab9bd399a --- /dev/null +++ b/contrib/mesos/pkg/scheduler/types.go @@ -0,0 +1,49 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheduler + +import ( + "errors" + + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers" + "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask" +) + +// PodScheduleFunc implements how to schedule pods among slaves. +// We can have different implementations for different scheduling policies. +// +// The Schedule function accepts a group of slaves (each contains offers from +// that slave) and a single pod, which aligns well with the k8s scheduling +// algorithm.
It returns an offer that is acceptable for the pod, otherwise +// nil. The caller is responsible for filling in task state w/ relevant offer +// details. +// +// See FCFSScheduleFunc for an example. +type PodScheduleFunc func(r offers.Registry, slaves SlaveIndex, task *podtask.T) (offers.Perishable, error) + +// A minimal placeholder +type empty struct{} + +var ( + noSuitableOffersErr = errors.New("No suitable offers for pod/task") + noSuchPodErr = errors.New("No such pod exists") + noSuchTaskErr = errors.New("No such task exists") +) + +type SlaveIndex interface { + slaveFor(id string) (*Slave, bool) +} diff --git a/contrib/mesos/pkg/scheduler/uid/doc.go b/contrib/mesos/pkg/scheduler/uid/doc.go new file mode 100644 index 00000000000..cc8c35432cc --- /dev/null +++ b/contrib/mesos/pkg/scheduler/uid/doc.go @@ -0,0 +1,18 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package uid encapsulates the unique identifier code used by the scheduler. +package uid diff --git a/contrib/mesos/pkg/scheduler/uid/uid.go b/contrib/mesos/pkg/scheduler/uid/uid.go new file mode 100644 index 00000000000..37f4701d373 --- /dev/null +++ b/contrib/mesos/pkg/scheduler/uid/uid.go @@ -0,0 +1,85 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+*/ + +package uid + +import ( + "fmt" + "strconv" + "strings" + + "code.google.com/p/go-uuid/uuid" + log "github.com/golang/glog" +) + +type UID struct { + group uint64 + name string + ser string +} + +func New(group uint64, name string) *UID { + if name == "" { + name = uuid.New() + } + return &UID{ + group: group, + name: name, + ser: fmt.Sprintf("%x_%s", group, name), + } +} + +func (self *UID) Name() string { + if self != nil { + return self.name + } + return "" +} + +func (self *UID) Group() uint64 { + if self != nil { + return self.group + } + return 0 +} + +func (self *UID) String() string { + if self != nil { + return self.ser + } + return "" +} + +func Parse(ser string) *UID { + parts := strings.SplitN(ser, "_", 2) + if len(parts) != 2 { + return nil + } + group, err := strconv.ParseUint(parts[0], 16, 64) + if err != nil { + log.Errorf("illegal UID group %q: %v", parts[0], err) + return nil + } + if parts[1] == "" { + log.Errorf("missing UID name: %q", ser) + return nil + } + return &UID{ + group: group, + name: parts[1], + ser: ser, + } +} diff --git a/contrib/mesos/pkg/scheduler/uid/uid_test.go b/contrib/mesos/pkg/scheduler/uid/uid_test.go new file mode 100644 index 00000000000..67e60fdf14a --- /dev/null +++ b/contrib/mesos/pkg/scheduler/uid/uid_test.go @@ -0,0 +1,47 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uid + +import ( + "testing" +) + +func TestUID_Parse(t *testing.T) { + valid := []string{"1234567890abcdef_foo", "123_bar", "face_time"} + groups := []uint64{0x1234567890abcdef, 0x123, 0xface} + + for i, good := range valid { + u := Parse(good) + if u == nil { + t.Errorf("expected parsed UID, not nil") + } + if groups[i] != u.Group() { + t.Errorf("expected matching group instead of %x", u.Group()) + } + if good != u.String() { + t.Errorf("expected %q instead of %q", good, u.String()) + } + } + + invalid := []string{"", "bad"} + for _, bad := range invalid { + u := Parse(bad) + if u != nil { + t.Errorf("expected nil UID instead of %v", u) + } + } +} diff --git a/contrib/mesos/target.sh b/contrib/mesos/target.sh new file mode 100644 index 00000000000..aaa9e8adac5 --- /dev/null +++ b/contrib/mesos/target.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +# Copyright 2014 The Kubernetes Authors All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# The set of server targets that we are only building for Linux +# Used by hack/lib/golang.sh +kube::contrib::mesos::server_targets() { + local -r targets=( + contrib/mesos/cmd/k8sm-scheduler + contrib/mesos/cmd/k8sm-executor + ) + echo "${targets[@]}" +} + +# The set of test targets that we are building for all platforms +# Used by hack/lib/golang.sh +kube::contrib::mesos::test_targets() { + local -r targets=( + contrib/mesos/cmd/k8sm-redirfd + ) + echo "${targets[@]}" +} + +# The set of source targets to include in the kube-build image +# Used by build/common.sh +kube::contrib::mesos::source_targets() { + local -r targets=( + contrib/mesos + ) + echo "${targets[@]}" +}
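
A minimal usage sketch of the lists above (hypothetical caller; the real consumers are hack/lib/golang.sh and build/common.sh, as the comments note). Each function simply echoes its targets, so a caller can capture them with command substitution:

    source contrib/mesos/target.sh
    # enumerate the Linux-only server binaries contributed by the Mesos integration
    for target in $(kube::contrib::mesos::server_targets); do
      echo "linux-only server target: ${target}"
    done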