Skip to content

Commit adeb054

Browse files
committed
chore: improve addon controller job management to prevent blocking updates
- Add job timeout (5 minutes default) to prevent indefinite hanging - Implement generation tracking for jobs to enable cleanup of outdated jobs - Add automatic cleanup of outdated jobs when addon generation changes - Ensure new addon updates can proceed without waiting for stuck jobs - Add comprehensive test coverage for job cleanup scenarios - Use constants for annotation keys to improve maintainability This resolves issues where addon updates were blocked by stuck jobs due to image pull failures or other pod startup issues.
1 parent 4de60f2 commit adeb054

File tree

7 files changed

+117
-0
lines changed

7 files changed

+117
-0
lines changed

cmd/manager/main.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,13 @@ func validateRequiredToParseConfigs() error {
266266
return err
267267
}
268268
}
269+
270+
if jobTimeout := viper.GetString(constant.CfgAddonJobTimeout); jobTimeout != "" {
271+
if _, err := time.ParseDuration(jobTimeout); err != nil {
272+
return err
273+
}
274+
}
275+
269276
if err := validateTolerations(viper.GetString(constant.CfgKeyCtrlrMgrTolerations)); err != nil {
270277
return err
271278
}

controllers/extensions/addon_controller_stages.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"encoding/json"
2525
"fmt"
2626
"slices"
27+
"strconv"
2728
"strings"
2829
"time"
2930

@@ -543,6 +544,18 @@ func (r *helmTypeInstallStage) Handle(ctx context.Context) {
543544
r.setRequeueWithErr(err, "")
544545
return
545546
} else if err == nil {
547+
// Check if the job is outdated (belongs to an older generation)
548+
if isJobOutdated(helmInstallJob, addon) {
549+
r.reqCtx.Log.Info("Deleting outdated install job", "job", key.Name, "jobGeneration",
550+
helmInstallJob.Annotations[AddonGeneration], "addonGeneration", addon.Generation)
551+
if err := r.reconciler.Delete(ctx, helmInstallJob); client.IgnoreNotFound(err) != nil {
552+
r.setRequeueWithErr(err, "")
553+
return
554+
}
555+
r.setRequeueAfter(time.Second, "recreating install job for new generation")
556+
return
557+
}
558+
546559
if helmInstallJob.Status.Succeeded > 0 {
547560
return
548561
}
@@ -710,6 +723,18 @@ func (r *helmTypeUninstallStage) Handle(ctx context.Context) {
710723
r.setRequeueWithErr(err, "")
711724
return
712725
} else if err == nil {
726+
// Check if the job is outdated (belongs to an older generation)
727+
if isJobOutdated(helmUninstallJob, addon) {
728+
r.reqCtx.Log.Info("Deleting outdated uninstall job", "job", key.Name, "jobGeneration",
729+
helmUninstallJob.Annotations[AddonGeneration], "addonGeneration", addon.Generation)
730+
if err := r.reconciler.Delete(ctx, helmUninstallJob); client.IgnoreNotFound(err) != nil {
731+
r.setRequeueWithErr(err, "")
732+
return
733+
}
734+
r.setRequeueAfter(time.Second, "recreating uninstall job for new generation")
735+
return
736+
}
737+
713738
if helmUninstallJob.Status.Succeeded > 0 {
714739
r.reqCtx.Log.V(1).Info("helm uninstall job succeed", "job", key)
715740
// TODO:
@@ -911,6 +936,17 @@ func createHelmJobProto(addon *extensionsv1alpha1.Addon) (*batchv1.Job, error) {
911936
}
912937
ttlSec := int32(ttl.Seconds())
913938
backoffLimit := int32(3)
939+
940+
// Set job timeout to prevent jobs from running indefinitely
941+
jobTimeout := time.Minute * 5
942+
if timeout := viper.GetString(constant.CfgAddonJobTimeout); timeout != "" {
943+
var err error
944+
if jobTimeout, err = time.ParseDuration(timeout); err != nil {
945+
return nil, err
946+
}
947+
}
948+
activeDeadlineSeconds := int64(jobTimeout.Seconds())
949+
914950
container := corev1.Container{
915951
Name: getJobMainContainerName(addon),
916952
Image: viper.GetString(constant.KBToolsImage),
@@ -940,10 +976,15 @@ func createHelmJobProto(addon *extensionsv1alpha1.Addon) (*batchv1.Job, error) {
940976
constant.AddonNameLabelKey: addon.Name,
941977
constant.AppManagedByLabelKey: constant.AppName,
942978
},
979+
Annotations: map[string]string{
980+
// Add generation annotation to track which addon generation this job belongs to
981+
AddonGeneration: fmt.Sprintf("%d", addon.Generation),
982+
},
943983
},
944984
Spec: batchv1.JobSpec{
945985
BackoffLimit: &backoffLimit,
946986
TTLSecondsAfterFinished: &ttlSec,
987+
ActiveDeadlineSeconds: &activeDeadlineSeconds,
947988
Template: corev1.PodTemplateSpec{
948989
ObjectMeta: metav1.ObjectMeta{
949990
Labels: map[string]string{
@@ -1075,6 +1116,26 @@ func getJobMainContainerName(addon *extensionsv1alpha1.Addon) string {
10751116
return strings.ToLower(string(addon.Spec.Type))
10761117
}
10771118

1119+
// isJobOutdated checks if the job belongs to an older generation of the addon
1120+
func isJobOutdated(job *batchv1.Job, addon *extensionsv1alpha1.Addon) bool {
1121+
if job.Annotations == nil {
1122+
// Jobs without generation annotation are considered outdated
1123+
return true
1124+
}
1125+
1126+
jobGenStr, exists := job.Annotations[AddonGeneration]
1127+
if !exists {
1128+
return true
1129+
}
1130+
1131+
jobGen, err := strconv.ParseInt(jobGenStr, 10, 64)
1132+
if err != nil {
1133+
return true
1134+
}
1135+
1136+
return jobGen < addon.Generation
1137+
}
1138+
10781139
func logFailedJobPodToCondError(ctx context.Context, stageCtx *stageCtx, addon *extensionsv1alpha1.Addon,
10791140
jobName, reason string) error {
10801141
podList := &corev1.PodList{}

controllers/extensions/addon_controller_test.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -964,6 +964,50 @@ var _ = Describe("Addon controller", func() {
964964
g.Expect(addon.Status.Phase).Should(Equal(extensionsv1alpha1.AddonEnabling))
965965
}).Should(Succeed())
966966
})
967+
968+
It("should cleanup outdated jobs when addon is updated", func() {
969+
By("By create an addon with auto-install")
970+
createAddonSpecWithRequiredAttributes(func(newOjb *extensionsv1alpha1.Addon) {
971+
newOjb.Spec.Installable.AutoInstall = true
972+
})
973+
974+
By("By addon autoInstall auto added")
975+
enablingPhaseCheck(2)
976+
977+
By("By checking install job is created with generation annotation")
978+
jobKey := client.ObjectKey{
979+
Namespace: viper.GetString(constant.CfgKeyCtrlrMgrNS),
980+
Name: getInstallJobName(addon),
981+
}
982+
Eventually(func(g Gomega) {
983+
job := &batchv1.Job{}
984+
g.Expect(testCtx.Cli.Get(ctx, jobKey, job)).Should(Succeed())
985+
g.Expect(job.Annotations).Should(HaveKey(AddonGeneration))
986+
g.Expect(job.Annotations[AddonGeneration]).Should(Equal("2"))
987+
g.Expect(job.Spec.ActiveDeadlineSeconds).ShouldNot(BeNil())
988+
g.Expect(*job.Spec.ActiveDeadlineSeconds).Should(Equal(int64(300)))
989+
}).Should(Succeed())
990+
991+
By("By updating addon to trigger new generation")
992+
addon = &extensionsv1alpha1.Addon{}
993+
Expect(testCtx.Cli.Get(ctx, key, addon)).To(Not(HaveOccurred()))
994+
addon.Spec.InstallSpec.Enabled = false
995+
Expect(testCtx.Cli.Update(ctx, addon)).Should(Succeed())
996+
addon.Spec.InstallSpec.Enabled = true
997+
Expect(testCtx.Cli.Update(ctx, addon)).Should(Succeed())
998+
999+
By("By checking old job is deleted and new job is created")
1000+
Eventually(func(g Gomega) {
1001+
_, err := doReconcile()
1002+
g.Expect(err).To(Not(HaveOccurred()))
1003+
job := &batchv1.Job{}
1004+
err = testCtx.Cli.Get(ctx, jobKey, job)
1005+
if err == nil {
1006+
// If job exists, it should have the new generation
1007+
g.Expect(job.Annotations[AddonGeneration]).Should(Equal("4"))
1008+
}
1009+
}).Should(Succeed())
1010+
})
9671011
})
9681012

9691013
Context("Addon controller SetupWithManager", func() {

controllers/extensions/const.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ const (
2929
NoDeleteJobs = "extensions.kubeblocks.io/no-delete-jobs"
3030
AddonDefaultIsEmpty = "addons.extensions.kubeblocks.io/default-is-empty"
3131
KBVersionValidate = "addon.kubeblocks.io/kubeblocks-version"
32+
AddonGeneration = "addon.kubeblocks.io/generation"
3233

3334
// label keys
3435
AddonProvider = "addon.kubeblocks.io/provider"

deploy/helm/templates/deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,8 @@ spec:
163163
value: {{ .jobTTL | quote }}
164164
- name: ADDON_JOB_IMAGE_PULL_POLICY
165165
value: {{ .jobImagePullPolicy | default "IfNotPresent" }}
166+
- name: ADDON_JOB_TIMEOUT
167+
value: {{ .jobTimeout | quote }}
166168
{{- end }}
167169
- name: KUBEBLOCKS_ADDON_HELM_INSTALL_OPTIONS
168170
value: {{ join " " .Values.addonHelmInstallOptions }}

deploy/helm/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,7 @@ addonController:
467467
enabled: true
468468
jobTTL: "5m"
469469
jobImagePullPolicy: IfNotPresent
470+
jobTimeout: "5m"
470471

471472

472473
## @param keepAddons - keep Addon CR objects when delete this chart.

pkg/constant/viper_config.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ const (
3535
// addon config keys
3636
CfgKeyAddonJobTTL = "ADDON_JOB_TTL"
3737
CfgAddonJobImgPullPolicy = "ADDON_JOB_IMAGE_PULL_POLICY"
38+
CfgAddonJobTimeout = "ADDON_JOB_TIMEOUT"
3839

3940
// addon charts config keys
4041
CfgAddonChartsImgPullPolicy = "KUBEBLOCKS_ADDON_CHARTS_IMAGE_PULL_POLICY"

0 commit comments

Comments
 (0)