mirror of
https://github.com/k3s-io/kubernetes.git
synced 2026-01-13 11:25:19 +00:00
Modify CronJob API to add job history limits, cleanup jobs in controller
This commit is contained in:
@@ -30,6 +30,7 @@ Just periodically list jobs and SJs, and then reconcile them.
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"github.com/golang/glog"
|
||||
@@ -92,13 +93,13 @@ func (jm *CronJobController) Run(stopCh <-chan struct{}) {
|
||||
defer utilruntime.HandleCrash()
|
||||
glog.Infof("Starting CronJob Manager")
|
||||
// Check things every 10 second.
|
||||
go wait.Until(jm.SyncAll, 10*time.Second, stopCh)
|
||||
go wait.Until(jm.syncAll, 10*time.Second, stopCh)
|
||||
<-stopCh
|
||||
glog.Infof("Shutting down CronJob Manager")
|
||||
}
|
||||
|
||||
// SyncAll lists all the CronJobs and Jobs and reconciles them.
|
||||
func (jm *CronJobController) SyncAll() {
|
||||
// syncAll lists all the CronJobs and Jobs and reconciles them.
|
||||
func (jm *CronJobController) syncAll() {
|
||||
sjl, err := jm.kubeClient.BatchV2alpha1().CronJobs(metav1.NamespaceAll).List(metav1.ListOptions{})
|
||||
if err != nil {
|
||||
glog.Errorf("Error listing cronjobs: %v", err)
|
||||
@@ -119,24 +120,86 @@ func (jm *CronJobController) SyncAll() {
|
||||
glog.V(4).Infof("Found %d groups", len(jobsBySj))
|
||||
|
||||
for _, sj := range sjs {
|
||||
SyncOne(sj, jobsBySj[sj.UID], time.Now(), jm.jobControl, jm.sjControl, jm.podControl, jm.recorder)
|
||||
syncOne(&sj, jobsBySj[sj.UID], time.Now(), jm.jobControl, jm.sjControl, jm.podControl, jm.recorder)
|
||||
cleanupFinishedJobs(&sj, jobsBySj[sj.UID], jm.jobControl, jm.sjControl, jm.podControl, jm.recorder)
|
||||
}
|
||||
}
|
||||
|
||||
// SyncOne reconciles a CronJob with a list of any Jobs that it created.
|
||||
// cleanupFinishedJobs cleanups finished jobs created by a CronJob
|
||||
func cleanupFinishedJobs(sj *batch.CronJob, js []batch.Job, jc jobControlInterface, sjc sjControlInterface, pc podControlInterface, recorder record.EventRecorder) {
|
||||
// If neither limits are active, there is no need to do anything.
|
||||
if sj.Spec.FailedJobsHistoryLimit == nil && sj.Spec.SuccessfulJobsHistoryLimit == nil {
|
||||
return
|
||||
}
|
||||
|
||||
failedJobs := []batch.Job{}
|
||||
succesfulJobs := []batch.Job{}
|
||||
|
||||
for _, job := range js {
|
||||
isFinished, finishedStatus := getFinishedStatus(&job)
|
||||
if isFinished && finishedStatus == batch.JobComplete {
|
||||
succesfulJobs = append(succesfulJobs, job)
|
||||
} else if isFinished && finishedStatus == batch.JobFailed {
|
||||
failedJobs = append(failedJobs, job)
|
||||
}
|
||||
}
|
||||
|
||||
if sj.Spec.SuccessfulJobsHistoryLimit != nil {
|
||||
removeOldestJobs(sj,
|
||||
succesfulJobs,
|
||||
jc,
|
||||
pc,
|
||||
*sj.Spec.SuccessfulJobsHistoryLimit,
|
||||
recorder)
|
||||
}
|
||||
|
||||
if sj.Spec.FailedJobsHistoryLimit != nil {
|
||||
removeOldestJobs(sj,
|
||||
failedJobs,
|
||||
jc,
|
||||
pc,
|
||||
*sj.Spec.FailedJobsHistoryLimit,
|
||||
recorder)
|
||||
}
|
||||
|
||||
// Update the CronJob, in case jobs were removed from the list.
|
||||
if _, err := sjc.UpdateStatus(sj); err != nil {
|
||||
nameForLog := fmt.Sprintf("%s/%s", sj.Namespace, sj.Name)
|
||||
glog.Infof("Unable to update status for %s (rv = %s): %v", nameForLog, sj.ResourceVersion, err)
|
||||
}
|
||||
}
|
||||
|
||||
// removeOldestJobs removes the oldest jobs from a list of jobs
|
||||
func removeOldestJobs(sj *batch.CronJob, js []batch.Job, jc jobControlInterface, pc podControlInterface, maxJobs int32, recorder record.EventRecorder) {
|
||||
numToDelete := len(js) - int(maxJobs)
|
||||
if numToDelete <= 0 {
|
||||
return
|
||||
}
|
||||
|
||||
nameForLog := fmt.Sprintf("%s/%s", sj.Namespace, sj.Name)
|
||||
glog.V(4).Infof("Cleaning up %d/%d jobs from %s", numToDelete, len(js), nameForLog)
|
||||
|
||||
sort.Sort(byJobStartTime(js))
|
||||
for i := 0; i < numToDelete; i++ {
|
||||
glog.V(4).Infof("Removing job %s from %s", js[i].Name, nameForLog)
|
||||
deleteJob(sj, &js[i], jc, pc, recorder, "history limit reached")
|
||||
}
|
||||
}
|
||||
|
||||
// syncOne reconciles a CronJob with a list of any Jobs that it created.
|
||||
// All known jobs created by "sj" should be included in "js".
|
||||
// The current time is passed in to facilitate testing.
|
||||
// It has no receiver, to facilitate testing.
|
||||
func SyncOne(sj batch.CronJob, js []batch.Job, now time.Time, jc jobControlInterface, sjc sjControlInterface, pc podControlInterface, recorder record.EventRecorder) {
|
||||
func syncOne(sj *batch.CronJob, js []batch.Job, now time.Time, jc jobControlInterface, sjc sjControlInterface, pc podControlInterface, recorder record.EventRecorder) {
|
||||
nameForLog := fmt.Sprintf("%s/%s", sj.Namespace, sj.Name)
|
||||
|
||||
childrenJobs := make(map[types.UID]bool)
|
||||
for i := range js {
|
||||
j := js[i]
|
||||
childrenJobs[j.ObjectMeta.UID] = true
|
||||
found := inActiveList(sj, j.ObjectMeta.UID)
|
||||
found := inActiveList(*sj, j.ObjectMeta.UID)
|
||||
if !found && !IsJobFinished(&j) {
|
||||
recorder.Eventf(&sj, v1.EventTypeWarning, "UnexpectedJob", "Saw a job that the controller did not create or forgot: %v", j.Name)
|
||||
recorder.Eventf(sj, v1.EventTypeWarning, "UnexpectedJob", "Saw a job that the controller did not create or forgot: %v", j.Name)
|
||||
// We found an unfinished job that has us as the parent, but it is not in our Active list.
|
||||
// This could happen if we crashed right after creating the Job and before updating the status,
|
||||
// or if our jobs list is newer than our sj status after a relist, or if someone intentionally created
|
||||
@@ -148,9 +211,9 @@ func SyncOne(sj batch.CronJob, js []batch.Job, now time.Time, jc jobControlInter
|
||||
// in the same namespace "adopt" that job. ReplicaSets and their Pods work the same way.
|
||||
// TBS: how to update sj.Status.LastScheduleTime if the adopted job is newer than any we knew about?
|
||||
} else if found && IsJobFinished(&j) {
|
||||
deleteFromActiveList(&sj, j.ObjectMeta.UID)
|
||||
deleteFromActiveList(sj, j.ObjectMeta.UID)
|
||||
// TODO: event to call out failure vs success.
|
||||
recorder.Eventf(&sj, v1.EventTypeNormal, "SawCompletedJob", "Saw completed job: %v", j.Name)
|
||||
recorder.Eventf(sj, v1.EventTypeNormal, "SawCompletedJob", "Saw completed job: %v", j.Name)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -159,25 +222,25 @@ func SyncOne(sj batch.CronJob, js []batch.Job, now time.Time, jc jobControlInter
|
||||
// job running.
|
||||
for _, j := range sj.Status.Active {
|
||||
if found := childrenJobs[j.UID]; !found {
|
||||
recorder.Eventf(&sj, v1.EventTypeNormal, "MissingJob", "Active job went missing: %v", j.Name)
|
||||
deleteFromActiveList(&sj, j.UID)
|
||||
recorder.Eventf(sj, v1.EventTypeNormal, "MissingJob", "Active job went missing: %v", j.Name)
|
||||
deleteFromActiveList(sj, j.UID)
|
||||
}
|
||||
}
|
||||
|
||||
updatedSJ, err := sjc.UpdateStatus(&sj)
|
||||
updatedSJ, err := sjc.UpdateStatus(sj)
|
||||
if err != nil {
|
||||
glog.Errorf("Unable to update status for %s (rv = %s): %v", nameForLog, sj.ResourceVersion, err)
|
||||
return
|
||||
}
|
||||
sj = *updatedSJ
|
||||
*sj = *updatedSJ
|
||||
|
||||
if sj.Spec.Suspend != nil && *sj.Spec.Suspend {
|
||||
glog.V(4).Infof("Not starting job for %s because it is suspended", nameForLog)
|
||||
return
|
||||
}
|
||||
times, err := getRecentUnmetScheduleTimes(sj, now)
|
||||
times, err := getRecentUnmetScheduleTimes(*sj, now)
|
||||
if err != nil {
|
||||
recorder.Eventf(&sj, v1.EventTypeWarning, "FailedNeedsStart", "Cannot determine if job needs to be started: %v", err)
|
||||
recorder.Eventf(sj, v1.EventTypeWarning, "FailedNeedsStart", "Cannot determine if job needs to be started: %v", err)
|
||||
glog.Errorf("Cannot determine if %s needs to be started: %v", nameForLog, err)
|
||||
}
|
||||
// TODO: handle multiple unmet start times, from oldest to newest, updating status as needed.
|
||||
@@ -224,73 +287,37 @@ func SyncOne(sj batch.CronJob, js []batch.Job, now time.Time, jc jobControlInter
|
||||
// TODO: this should be replaced with server side job deletion
|
||||
// currently this mimics JobReaper from pkg/kubectl/stop.go
|
||||
glog.V(4).Infof("Deleting job %s of %s that was still running at next scheduled start time", j.Name, nameForLog)
|
||||
|
||||
job, err := jc.GetJob(j.Namespace, j.Name)
|
||||
if err != nil {
|
||||
recorder.Eventf(&sj, v1.EventTypeWarning, "FailedGet", "Get job: %v", err)
|
||||
recorder.Eventf(sj, v1.EventTypeWarning, "FailedGet", "Get job: %v", err)
|
||||
return
|
||||
}
|
||||
// scale job down to 0
|
||||
if *job.Spec.Parallelism != 0 {
|
||||
zero := int32(0)
|
||||
job.Spec.Parallelism = &zero
|
||||
job, err = jc.UpdateJob(job.Namespace, job)
|
||||
if err != nil {
|
||||
recorder.Eventf(&sj, v1.EventTypeWarning, "FailedUpdate", "Update job: %v", err)
|
||||
return
|
||||
}
|
||||
}
|
||||
// remove all pods...
|
||||
selector, _ := metav1.LabelSelectorAsSelector(job.Spec.Selector)
|
||||
options := metav1.ListOptions{LabelSelector: selector.String()}
|
||||
podList, err := pc.ListPods(job.Namespace, options)
|
||||
if err != nil {
|
||||
recorder.Eventf(&sj, v1.EventTypeWarning, "FailedList", "List job-pods: %v", err)
|
||||
}
|
||||
errList := []error{}
|
||||
for _, pod := range podList.Items {
|
||||
glog.V(2).Infof("CronJob controller is deleting Pod %v/%v", pod.Namespace, pod.Name)
|
||||
if err := pc.DeletePod(pod.Namespace, pod.Name); err != nil {
|
||||
// ignores the error when the pod isn't found
|
||||
if !errors.IsNotFound(err) {
|
||||
errList = append(errList, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(errList) != 0 {
|
||||
recorder.Eventf(&sj, v1.EventTypeWarning, "FailedDelete", "Deleted job-pods: %v", utilerrors.NewAggregate(errList))
|
||||
if !deleteJob(sj, job, jc, pc, recorder, "") {
|
||||
return
|
||||
}
|
||||
// ... the job itself...
|
||||
if err := jc.DeleteJob(job.Namespace, job.Name); err != nil {
|
||||
recorder.Eventf(&sj, v1.EventTypeWarning, "FailedDelete", "Deleted job: %v", err)
|
||||
glog.Errorf("Error deleting job %s from %s: %v", job.Name, nameForLog, err)
|
||||
return
|
||||
}
|
||||
// ... and its reference from active list
|
||||
deleteFromActiveList(&sj, job.ObjectMeta.UID)
|
||||
recorder.Eventf(&sj, v1.EventTypeNormal, "SuccessfulDelete", "Deleted job %v", j.Name)
|
||||
}
|
||||
}
|
||||
|
||||
jobReq, err := getJobFromTemplate(&sj, scheduledTime)
|
||||
jobReq, err := getJobFromTemplate(sj, scheduledTime)
|
||||
if err != nil {
|
||||
glog.Errorf("Unable to make Job from template in %s: %v", nameForLog, err)
|
||||
return
|
||||
}
|
||||
jobResp, err := jc.CreateJob(sj.Namespace, jobReq)
|
||||
if err != nil {
|
||||
recorder.Eventf(&sj, v1.EventTypeWarning, "FailedCreate", "Error creating job: %v", err)
|
||||
recorder.Eventf(sj, v1.EventTypeWarning, "FailedCreate", "Error creating job: %v", err)
|
||||
return
|
||||
}
|
||||
glog.V(4).Infof("Created Job %s for %s", jobResp.Name, nameForLog)
|
||||
recorder.Eventf(&sj, v1.EventTypeNormal, "SuccessfulCreate", "Created job %v", jobResp.Name)
|
||||
recorder.Eventf(sj, v1.EventTypeNormal, "SuccessfulCreate", "Created job %v", jobResp.Name)
|
||||
|
||||
// ------------------------------------------------------------------ //
|
||||
|
||||
// If this process restarts at this point (after posting a job, but
|
||||
// before updating the status), then we might try to start the job on
|
||||
// the next time. Actually, if we relist the SJs and Jobs on the next
|
||||
// iteration of SyncAll, we might not see our own status update, and
|
||||
// iteration of syncAll, we might not see our own status update, and
|
||||
// then post one again. So, we need to use the job name as a lock to
|
||||
// prevent us from making the job twice (name the job with hash of its
|
||||
// scheduled time).
|
||||
@@ -303,13 +330,64 @@ func SyncOne(sj batch.CronJob, js []batch.Job, now time.Time, jc jobControlInter
|
||||
sj.Status.Active = append(sj.Status.Active, *ref)
|
||||
}
|
||||
sj.Status.LastScheduleTime = &metav1.Time{Time: scheduledTime}
|
||||
if _, err := sjc.UpdateStatus(&sj); err != nil {
|
||||
if _, err := sjc.UpdateStatus(sj); err != nil {
|
||||
glog.Infof("Unable to update status for %s (rv = %s): %v", nameForLog, sj.ResourceVersion, err)
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// deleteJob reaps a job, deleting the job, the pobs and the reference in the active list
|
||||
func deleteJob(sj *batch.CronJob, job *batch.Job, jc jobControlInterface, pc podControlInterface, recorder record.EventRecorder, reason string) bool {
|
||||
// TODO: this should be replaced with server side job deletion
|
||||
// currencontinuetly this mimics JobReaper from pkg/kubectl/stop.go
|
||||
nameForLog := fmt.Sprintf("%s/%s", sj.Namespace, sj.Name)
|
||||
var err error
|
||||
|
||||
// scale job down to 0
|
||||
if *job.Spec.Parallelism != 0 {
|
||||
zero := int32(0)
|
||||
job.Spec.Parallelism = &zero
|
||||
job, err = jc.UpdateJob(job.Namespace, job)
|
||||
if err != nil {
|
||||
recorder.Eventf(sj, v1.EventTypeWarning, "FailedUpdate", "Update job: %v", err)
|
||||
return false
|
||||
}
|
||||
}
|
||||
// remove all pods...
|
||||
selector, _ := metav1.LabelSelectorAsSelector(job.Spec.Selector)
|
||||
options := metav1.ListOptions{LabelSelector: selector.String()}
|
||||
podList, err := pc.ListPods(job.Namespace, options)
|
||||
if err != nil {
|
||||
recorder.Eventf(sj, v1.EventTypeWarning, "FailedList", "List job-pods: %v", err)
|
||||
}
|
||||
errList := []error{}
|
||||
for _, pod := range podList.Items {
|
||||
glog.V(2).Infof("CronJob controller is deleting Pod %v/%v", pod.Namespace, pod.Name)
|
||||
if err := pc.DeletePod(pod.Namespace, pod.Name); err != nil {
|
||||
// ignores the error when the pod isn't found
|
||||
if !errors.IsNotFound(err) {
|
||||
errList = append(errList, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(errList) != 0 {
|
||||
recorder.Eventf(sj, v1.EventTypeWarning, "FailedDelete", "Deleted job-pods: %v", utilerrors.NewAggregate(errList))
|
||||
return false
|
||||
}
|
||||
// ... the job itself...
|
||||
if err := jc.DeleteJob(job.Namespace, job.Name); err != nil {
|
||||
recorder.Eventf(sj, v1.EventTypeWarning, "FailedDelete", "Deleted job: %v", err)
|
||||
glog.Errorf("Error deleting job %s from %s: %v", job.Name, nameForLog, err)
|
||||
return false
|
||||
}
|
||||
// ... and its reference from active list
|
||||
deleteFromActiveList(sj, job.ObjectMeta.UID)
|
||||
recorder.Eventf(sj, v1.EventTypeNormal, "SuccessfulDelete", "Deleted job %v", job.Name)
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
func getRef(object runtime.Object) (*v1.ObjectReference, error) {
|
||||
return v1.GetReference(api.Scheme, object)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user