Skip to content

Commit 65e3adf

Browse files
committed
fix: record failed lifecycle operations
Adds the ability to record partial failures from resource lifecycle methods by adding a new `ResourceData.Error` field. This field is populated when an error occurs during `Create` and `Update` events. In this case, we'll persist failed resource back to the state before we terminate the plan. Combined with the existing `NeedsCreate` field, we can record when an operation needs to be retried on the next run. Recording partial failures also makes it so resources can be properly deleted if the user opts to perform a delete rather than a retry. PLAT-212
1 parent 9eb45c7 commit 65e3adf

File tree

8 files changed

+198
-24
lines changed

8 files changed

+198
-24
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
kind: Fixed
2+
body: Fixed a bug that prevented database deletion when we failed to create the Swarm service.
3+
time: 2026-01-19T09:57:16.746487-05:00

server/internal/resource/event.go

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ type Event struct {
3535
Diff jsondiff.Patch `json:"diff,omitempty"`
3636
}
3737

38+
func (e *Event) ResourceError() error {
39+
if e.Resource != nil && e.Resource.Error != "" {
40+
return errors.New(e.Resource.Error)
41+
}
42+
return nil
43+
}
44+
3845
// Apply applies this event to its resource. It does not modify the state in the
3946
// given Context.
4047
func (e *Event) Apply(ctx context.Context, rc *Context) error {
@@ -58,7 +65,10 @@ func (e *Event) Apply(ctx context.Context, rc *Context) error {
5865
}
5966

6067
func (e *Event) refresh(ctx context.Context, rc *Context, resource Resource) error {
61-
var needsRecreate bool
68+
// Retain the original Error and NeedsRecreate fields so that they're
69+
// available for planCreates.
70+
needsRecreate := e.Resource.NeedsRecreate
71+
applyErr := e.Resource.Error
6272

6373
err := resource.Refresh(ctx, rc)
6474
if errors.Is(err, ErrNotFound) {
@@ -71,37 +81,48 @@ func (e *Event) refresh(ctx context.Context, rc *Context, resource Resource) err
7181
if err != nil {
7282
return err
7383
}
84+
7485
updated.NeedsRecreate = needsRecreate
86+
updated.Error = applyErr
7587

7688
e.Resource = updated
7789

7890
return nil
7991
}
8092

8193
func (e *Event) create(ctx context.Context, rc *Context, resource Resource) error {
94+
var needsRecreate bool
95+
var applyErr string
96+
8297
if err := resource.Create(ctx, rc); err != nil {
83-
return fmt.Errorf("failed to create resource %s: %w", resource.Identifier(), err)
98+
needsRecreate = true
99+
applyErr = fmt.Sprintf("failed to create resource %s: %s", resource.Identifier(), err.Error())
84100
}
85101

86102
updated, err := ToResourceData(resource)
87103
if err != nil {
88104
return err
89105
}
106+
updated.NeedsRecreate = needsRecreate
107+
updated.Error = applyErr
90108

91109
e.Resource = updated
92110

93111
return nil
94112
}
95113

96114
func (e *Event) update(ctx context.Context, rc *Context, resource Resource) error {
115+
var applyErr string
116+
97117
if err := resource.Update(ctx, rc); err != nil {
98-
return fmt.Errorf("failed to update resource %s: %w", resource.Identifier(), err)
118+
applyErr = fmt.Sprintf("failed to update resource %s: %s", resource.Identifier(), err.Error())
99119
}
100120

101121
updated, err := ToResourceData(resource)
102122
if err != nil {
103123
return err
104124
}
125+
updated.Error = applyErr
105126

106127
e.Resource = updated
107128

@@ -110,6 +131,10 @@ func (e *Event) update(ctx context.Context, rc *Context, resource Resource) erro
110131

111132
func (e *Event) delete(ctx context.Context, rc *Context, resource Resource) error {
112133
if err := resource.Delete(ctx, rc); err != nil {
134+
// We need to return an error here to indicate that this event should
135+
// not be applied to the state. Applying a delete event to the state
136+
// removes the resource, so if we didn't return the error it would be
137+
// impossible to retry this operation.
113138
return fmt.Errorf("failed to delete resource %s: %w", resource.Identifier(), err)
114139
}
115140

server/internal/resource/event_test.go

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,24 @@ func TestEvent(t *testing.T) {
2424
eventType resource.EventType
2525
notFound bool
2626
lifecycleError string
27+
originalResourceError string
28+
originalResourceNeedsRecreate bool
2729
expectedErr string
30+
expectedResourceError string
2831
expectedResourceNeedsRecreate bool
2932
}{
3033
{
3134
name: "refresh success",
3235
eventType: resource.EventTypeRefresh,
3336
},
37+
{
38+
name: "refresh success retains Error and NeedsRecreate",
39+
eventType: resource.EventTypeRefresh,
40+
originalResourceError: "some error",
41+
originalResourceNeedsRecreate: true,
42+
expectedResourceError: "some error",
43+
expectedResourceNeedsRecreate: true,
44+
},
3445
{
3546
name: "refresh not found",
3647
eventType: resource.EventTypeRefresh,
@@ -48,25 +59,44 @@ func TestEvent(t *testing.T) {
4859
eventType: resource.EventTypeCreate,
4960
},
5061
{
51-
name: "create failed",
52-
eventType: resource.EventTypeCreate,
53-
lifecycleError: "some error",
54-
expectedErr: "failed to create resource test_resource::test: some error",
62+
name: "create success clears Error and NeedsRecreate",
63+
eventType: resource.EventTypeCreate,
64+
originalResourceError: "some error",
65+
originalResourceNeedsRecreate: true,
66+
},
67+
{
68+
name: "create failed",
69+
eventType: resource.EventTypeCreate,
70+
lifecycleError: "some error",
71+
expectedResourceError: "failed to create resource test_resource::test: some error",
72+
expectedResourceNeedsRecreate: true,
5573
},
5674
{
5775
name: "update success",
5876
eventType: resource.EventTypeUpdate,
5977
},
6078
{
61-
name: "update failed",
62-
eventType: resource.EventTypeUpdate,
63-
lifecycleError: "some error",
64-
expectedErr: "failed to update resource test_resource::test: some error",
79+
name: "update success clears Error and NeedsRecreate",
80+
eventType: resource.EventTypeUpdate,
81+
originalResourceError: "some error",
82+
originalResourceNeedsRecreate: true,
83+
},
84+
{
85+
name: "update failed",
86+
eventType: resource.EventTypeUpdate,
87+
lifecycleError: "some error",
88+
expectedResourceError: "failed to update resource test_resource::test: some error",
6589
},
6690
{
6791
name: "delete success",
6892
eventType: resource.EventTypeDelete,
6993
},
94+
{
95+
name: "delete success clears Error and NeedsRecreate",
96+
eventType: resource.EventTypeDelete,
97+
originalResourceError: "some error",
98+
originalResourceNeedsRecreate: true,
99+
},
70100
{
71101
name: "delete failed",
72102
eventType: resource.EventTypeDelete,
@@ -82,8 +112,11 @@ func TestEvent(t *testing.T) {
82112
}
83113

84114
original := r.data(t)
115+
original.Error = tc.originalResourceError
116+
original.NeedsRecreate = tc.originalResourceNeedsRecreate
85117

86118
expected := r.data(t)
119+
expected.Error = tc.expectedResourceError
87120
expected.NeedsRecreate = tc.expectedResourceNeedsRecreate
88121

89122
event := &resource.Event{

server/internal/resource/resource.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ type ResourceData struct {
4343
DiffIgnore []string `json:"diff_ignore"`
4444
ResourceVersion string `json:"resource_version"`
4545
PendingDeletion bool `json:"pending_deletion"`
46+
Error string `json:"error"`
4647
}
4748

4849
func (r *ResourceData) Diff(other *ResourceData) (jsondiff.Patch, error) {
@@ -70,6 +71,7 @@ func (r *ResourceData) Clone() *ResourceData {
7071
DiffIgnore: slices.Clone(r.DiffIgnore),
7172
ResourceVersion: r.ResourceVersion,
7273
PendingDeletion: r.PendingDeletion,
74+
Error: r.Error,
7375
}
7476
}
7577

server/internal/resource/state.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,12 @@ func (s *State) planCreates(options PlanOptions, desired *State) (Plan, error) {
297297
Resource: resource,
298298
Reason: EventReasonNeedsRecreate,
299299
}
300+
case currentResource.Error != "":
301+
event = &Event{
302+
Type: EventTypeUpdate,
303+
Resource: resource,
304+
Reason: EventReasonHasError,
305+
}
300306
case options.ForceUpdate:
301307
event = &Event{
302308
Type: EventTypeUpdate,

server/internal/resource/state_test.go

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,97 @@ func TestState(t *testing.T) {
422422
assert.ElementsMatch(t, expected[i], phase)
423423
}
424424
})
425+
t.Run("error from previous create", func(t *testing.T) {
426+
resource1Data, err := resource.ToResourceData(&testResource{
427+
ID: "test1",
428+
TestDependencies: []resource.Identifier{
429+
testResourceID("test2"),
430+
},
431+
})
432+
require.NoError(t, err)
433+
434+
resource2Data, err := resource.ToResourceData(&testResource{
435+
ID: "test2",
436+
})
437+
require.NoError(t, err)
438+
439+
current := resource.NewState()
440+
desired := resource.NewState()
441+
442+
resource2WithError := resource2Data.Clone()
443+
resource2WithError.NeedsRecreate = true
444+
resource2WithError.Error = "some error"
445+
446+
current.Add(resource2WithError)
447+
desired.Add(resource1Data, resource2Data)
448+
449+
plan, err := current.Plan(resource.PlanOptions{}, desired)
450+
assert.NoError(t, err)
451+
452+
expected := resource.Plan{
453+
{
454+
{
455+
Type: resource.EventTypeCreate,
456+
Resource: resource2Data,
457+
Reason: resource.EventReasonNeedsRecreate,
458+
},
459+
},
460+
{
461+
{
462+
Type: resource.EventTypeCreate,
463+
Resource: resource1Data,
464+
Reason: resource.EventReasonDoesNotExist,
465+
},
466+
},
467+
}
468+
469+
assert.Equal(t, expected, plan)
470+
})
471+
t.Run("error from previous update", func(t *testing.T) {
472+
resource1Data, err := resource.ToResourceData(&testResource{
473+
ID: "test1",
474+
TestDependencies: []resource.Identifier{
475+
testResourceID("test2"),
476+
},
477+
})
478+
require.NoError(t, err)
479+
480+
resource2Data, err := resource.ToResourceData(&testResource{
481+
ID: "test2",
482+
})
483+
require.NoError(t, err)
484+
485+
current := resource.NewState()
486+
desired := resource.NewState()
487+
488+
resource2WithError := resource2Data.Clone()
489+
resource2WithError.Error = "some error"
490+
491+
current.Add(resource1Data, resource2WithError)
492+
desired.Add(resource1Data, resource2Data)
493+
494+
plan, err := current.Plan(resource.PlanOptions{}, desired)
495+
assert.NoError(t, err)
496+
497+
expected := resource.Plan{
498+
{
499+
{
500+
Type: resource.EventTypeUpdate,
501+
Resource: resource2Data,
502+
Reason: resource.EventReasonHasError,
503+
},
504+
},
505+
{
506+
{
507+
Type: resource.EventTypeUpdate,
508+
Resource: resource1Data,
509+
Reason: resource.EventReasonDependencyUpdated,
510+
},
511+
},
512+
}
513+
514+
assert.Equal(t, expected, plan)
515+
})
425516
t.Run("missing create dependency", func(t *testing.T) {
426517
resource1 := &testResource{
427518
ID: "test1",

server/internal/workflows/activities/apply_event.go

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -139,9 +139,17 @@ func (a *Activities) logResourceEvent(
139139

140140
fields["duration_ms"] = duration.Milliseconds()
141141

142-
if applyErr != nil {
142+
var applyErrStr string
143+
switch {
144+
case applyErr != nil:
145+
applyErrStr = applyErr.Error()
146+
case event.Type != resource.EventTypeRefresh:
147+
applyErrStr = event.Resource.Error
148+
}
149+
150+
if applyErrStr != "" {
143151
fields["success"] = false
144-
fields["error"] = applyErr.Error()
152+
fields["error"] = applyErrStr
145153

146154
err := log(task.LogEntry{
147155
Message: fmt.Sprintf("error while %s resource %s", verb, resourceIdentifier),
@@ -153,18 +161,17 @@ func (a *Activities) logResourceEvent(
153161
fmt.Errorf("failed to record event error: %w", err),
154162
)
155163
}
156-
return applyErr
157-
}
158-
159-
fields["success"] = true
164+
} else {
165+
fields["success"] = true
160166

161-
err = log(task.LogEntry{
162-
Message: fmt.Sprintf("finished %s resource %s (took %s)", verb, resourceIdentifier, duration),
163-
Fields: fields,
164-
})
165-
if err != nil {
166-
return fmt.Errorf("failed to record event completion: %w", err)
167+
err = log(task.LogEntry{
168+
Message: fmt.Sprintf("finished %s resource %s (took %s)", verb, resourceIdentifier, duration),
169+
Fields: fields,
170+
})
171+
if err != nil {
172+
return fmt.Errorf("failed to record event completion: %w", err)
173+
}
167174
}
168175

169-
return nil
176+
return applyErr
170177
}

server/internal/workflows/common.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,13 @@ func (w *Workflows) applyEvents(
7575
event := phase[i]
7676
errs = append(errs, fmt.Errorf("failed to apply %s event from %s to state: %w", event.Type, event.Resource.Identifier, err))
7777
}
78+
if err := out.Event.ResourceError(); err != nil && out.Event.Type != resource.EventTypeRefresh {
79+
// Returns errors that originated from the resource's lifecycle
80+
// method. They're already formatted with the event type and the
81+
// resource identifier. We still want to apply the event to the
82+
// state to record partial creates/updates.
83+
errs = append(errs, err)
84+
}
7885
}
7986
if err := errors.Join(errs...); err != nil {
8087
return fmt.Errorf("failed while modifying resources: %w", err)

0 commit comments

Comments
 (0)