Skip to content

Commit 8e8a2bf

Browse files
authored
Implement Status Updater Retrying on Failures (#1062)
Implement retries on status update failure. Problem: NGF will not retry on status update failure, thus there is a chance that some resources will not have up-to-date statuses. Solution: Add retry logic when a status update fails, with a small exponential backoff after each retry. Also, added logic to allow for a graceful exit of the status updater when the NGF pod context is cancelled.
1 parent 8e57fe8 commit 8e8a2bf

File tree

5 files changed

+343
-54
lines changed

5 files changed

+343
-54
lines changed
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
package status
2+
3+
import (
4+
"context"
5+
6+
"sigs.k8s.io/controller-runtime/pkg/client"
7+
)
8+
9+
//go:generate go run github.com/maxbrunsfeld/counterfeiter/v6 . K8sUpdater
10+
11+
// K8sUpdater updates a resource through the k8s API.
12+
// It allows us to mock the Update method of the writer returned by the client's Status() call.
13+
type K8sUpdater interface {
14+
// Update has the same signature as the Update method of client.SubResourceWriter (what client.StatusClient.Status() returns).
15+
Update(ctx context.Context, obj client.Object, opts ...client.SubResourceUpdateOption) error
16+
}

internal/framework/status/statusfakes/fake_k8s_updater.go

Lines changed: 117 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

internal/framework/status/updater.go

Lines changed: 83 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,20 @@ package status
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
67
"sync"
8+
"time"
79

810
"github.com/go-logr/logr"
911
apierrors "k8s.io/apimachinery/pkg/api/errors"
1012
"k8s.io/apimachinery/pkg/types"
13+
"k8s.io/apimachinery/pkg/util/wait"
1114
"sigs.k8s.io/controller-runtime/pkg/client"
1215
"sigs.k8s.io/gateway-api/apis/v1beta1"
1316

1417
ngfAPI "github.com/nginxinc/nginx-gateway-fabric/apis/v1alpha1"
18+
"github.com/nginxinc/nginx-gateway-fabric/internal/framework/controller"
1519
)
1620

1721
//go:generate go run github.com/maxbrunsfeld/counterfeiter/v6 . Updater
@@ -64,15 +68,11 @@ type UpdaterConfig struct {
6468
// (b) k8s API can become slow or even timeout. This will increase every update status API call.
6569
// Making UpdaterImpl asynchronous will prevent it from adding variable delays to the event loop.
6670
//
67-
// (3) It doesn't retry on failures. This means there is a chance that some resources will not have up-to-do statuses.
68-
// Statuses are important part of the Gateway API, so we need to ensure that the Gateway always keep the resources
69-
// statuses up-to-date.
70-
//
71-
// (4) It doesn't clear the statuses of a resources that are no longer handled by the Gateway. For example, if
71+
// (3) It doesn't clear the statuses of resources that are no longer handled by the Gateway. For example, if
7272
// an HTTPRoute resource no longer has the parentRef to the Gateway resources, the Gateway must update the status
7373
// of the resource to remove the status about the removed parentRef.
7474
//
75-
// (5) If another controllers changes the status of the Gateway/HTTPRoute resource so that the information set by our
75+
// (4) If another controller changes the status of the Gateway/HTTPRoute resource so that the information set by our
7676
// Gateway is removed, our Gateway will not restore the status until the EventLoop invokes the StatusUpdater as a
7777
// result of processing some other new change to a resource(s).
7878
// FIXME(pleshakov): Make updater production ready
@@ -179,6 +179,11 @@ func (upd *UpdaterImpl) updateGatewayAPI(ctx context.Context, statuses GatewayAP
179179

180180
if upd.cfg.UpdateGatewayClassStatus {
181181
for nsname, gcs := range statuses.GatewayClassStatuses {
182+
select {
183+
case <-ctx.Done():
184+
return
185+
default:
186+
}
182187
upd.writeStatuses(ctx, nsname, &v1beta1.GatewayClass{}, func(object client.Object) {
183188
gc := object.(*v1beta1.GatewayClass)
184189
gc.Status = prepareGatewayClassStatus(gcs, upd.cfg.Clock.Now())
@@ -188,6 +193,11 @@ func (upd *UpdaterImpl) updateGatewayAPI(ctx context.Context, statuses GatewayAP
188193
}
189194

190195
for nsname, gs := range statuses.GatewayStatuses {
196+
select {
197+
case <-ctx.Done():
198+
return
199+
default:
200+
}
191201
upd.writeStatuses(ctx, nsname, &v1beta1.Gateway{}, func(object client.Object) {
192202
gw := object.(*v1beta1.Gateway)
193203
gw.Status = prepareGatewayStatus(gs, upd.cfg.PodIP, upd.cfg.Clock.Now())
@@ -200,7 +210,6 @@ func (upd *UpdaterImpl) updateGatewayAPI(ctx context.Context, statuses GatewayAP
200210
return
201211
default:
202212
}
203-
204213
upd.writeStatuses(ctx, nsname, &v1beta1.HTTPRoute{}, func(object client.Object) {
205214
hr := object.(*v1beta1.HTTPRoute)
206215
// statuses.GatewayStatus is never nil when len(statuses.HTTPRouteStatuses) > 0
@@ -219,26 +228,19 @@ func (upd *UpdaterImpl) writeStatuses(
219228
obj client.Object,
220229
statusSetter func(client.Object),
221230
) {
222-
// The function handles errors by reporting them in the logs.
223-
// We need to get the latest version of the resource.
224-
// Otherwise, the Update status API call can fail.
225-
// Note: the default client uses a cache for reads, so we're not making an unnecessary API call here.
226-
// the default is configurable in the Manager options.
227-
if err := upd.cfg.Client.Get(ctx, nsname, obj); err != nil {
228-
if !apierrors.IsNotFound(err) {
229-
upd.cfg.Logger.Error(
230-
err,
231-
"Failed to get the recent version the resource when updating status",
232-
"namespace", nsname.Namespace,
233-
"name", nsname.Name,
234-
"kind", obj.GetObjectKind().GroupVersionKind().Kind)
235-
}
236-
return
237-
}
238-
239-
statusSetter(obj)
240-
241-
if err := upd.cfg.Client.Status().Update(ctx, obj); err != nil {
231+
err := wait.ExponentialBackoffWithContext(
232+
ctx,
233+
wait.Backoff{
234+
Duration: time.Millisecond * 200,
235+
Factor: 2,
236+
Jitter: 0.5,
237+
Steps: 4,
238+
Cap: time.Millisecond * 3000,
239+
},
240+
// Function returns true if the condition is satisfied, or an error if the loop should be aborted.
241+
NewRetryUpdateFunc(upd.cfg.Client, upd.cfg.Client.Status(), nsname, obj, upd.cfg.Logger, statusSetter),
242+
)
243+
if err != nil && !errors.Is(err, context.Canceled) {
242244
upd.cfg.Logger.Error(
243245
err,
244246
"Failed to update status",
@@ -247,3 +249,57 @@ func (upd *UpdaterImpl) writeStatuses(
247249
"kind", obj.GetObjectKind().GroupVersionKind().Kind)
248250
}
249251
}
252+
253+
// NewRetryUpdateFunc returns a function which will be used in wait.ExponentialBackoffWithContext.
254+
// The function will attempt to Update a kubernetes resource and will be retried in
255+
// wait.ExponentialBackoffWithContext if an error occurs. Exported for testing purposes.
256+
//
257+
// wait.ExponentialBackoffWithContext will retry if this function returns nil as its error,
258+
// which is what we want if we encounter an error from the functions we call. However,
259+
// the linter will complain if we return nil if an error was found.
260+
//
261+
//nolint:nilerr
262+
func NewRetryUpdateFunc(
263+
getter controller.Getter,
264+
updater K8sUpdater,
265+
nsname types.NamespacedName,
266+
obj client.Object,
267+
logger logr.Logger,
268+
statusSetter func(client.Object),
269+
) func(ctx context.Context) (bool, error) {
270+
return func(ctx context.Context) (bool, error) {
271+
// The function handles errors by reporting them in the logs.
272+
// We need to get the latest version of the resource.
273+
// Otherwise, the Update status API call can fail.
274+
// Note: the default client uses a cache for reads, so we're not making an unnecessary API call here.
275+
// the default is configurable in the Manager options.
276+
if err := getter.Get(ctx, nsname, obj); err != nil {
277+
// apierrors.IsNotFound(err) can happen when the resource is deleted,
278+
// so no need to retry or return an error.
279+
if apierrors.IsNotFound(err) {
280+
return true, nil
281+
}
282+
logger.V(1).Info(
283+
"Encountered error when getting resource to update status",
284+
"error", err,
285+
"namespace", nsname.Namespace,
286+
"name", nsname.Name,
287+
"kind", obj.GetObjectKind().GroupVersionKind().Kind)
288+
return false, nil
289+
}
290+
291+
statusSetter(obj)
292+
293+
if err := updater.Update(ctx, obj); err != nil {
294+
logger.V(1).Info(
295+
"Encountered error updating status",
296+
"error", err,
297+
"namespace", nsname.Namespace,
298+
"name", nsname.Name,
299+
"kind", obj.GetObjectKind().GroupVersionKind().Kind)
300+
return false, nil
301+
}
302+
303+
return true, nil
304+
}
305+
}
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
package status_test
2+
3+
import (
4+
"context"
5+
"errors"
6+
"testing"
7+
8+
. "github.com/onsi/gomega"
9+
apierrors "k8s.io/apimachinery/pkg/api/errors"
10+
"k8s.io/apimachinery/pkg/runtime/schema"
11+
"k8s.io/apimachinery/pkg/types"
12+
"sigs.k8s.io/controller-runtime/pkg/client"
13+
"sigs.k8s.io/controller-runtime/pkg/log/zap"
14+
"sigs.k8s.io/gateway-api/apis/v1beta1"
15+
16+
"github.com/nginxinc/nginx-gateway-fabric/internal/framework/controller/controllerfakes"
17+
"github.com/nginxinc/nginx-gateway-fabric/internal/framework/status"
18+
"github.com/nginxinc/nginx-gateway-fabric/internal/framework/status/statusfakes"
19+
)
20+
21+
func TestNewRetryUpdateFunc(t *testing.T) {
22+
tests := []struct {
23+
getReturns error
24+
updateReturns error
25+
name string
26+
expConditionPassed bool
27+
}{
28+
{
29+
getReturns: errors.New("failed to get resource"),
30+
updateReturns: nil,
31+
name: "get fails",
32+
expConditionPassed: false,
33+
},
34+
{
35+
getReturns: apierrors.NewNotFound(schema.GroupResource{}, "not found"),
36+
updateReturns: nil,
37+
name: "get fails and apierrors is not found",
38+
expConditionPassed: true,
39+
},
40+
{
41+
getReturns: nil,
42+
updateReturns: errors.New("failed to update resource"),
43+
name: "update fails",
44+
expConditionPassed: false,
45+
},
46+
{
47+
getReturns: nil,
48+
updateReturns: nil,
49+
name: "nothing fails",
50+
expConditionPassed: true,
51+
},
52+
}
53+
54+
fakeStatusUpdater := &statusfakes.FakeK8sUpdater{}
55+
fakeGetter := &controllerfakes.FakeGetter{}
56+
for _, test := range tests {
57+
t.Run(test.name, func(t *testing.T) {
58+
g := NewWithT(t)
59+
fakeStatusUpdater.UpdateReturns(test.updateReturns)
60+
fakeGetter.GetReturns(test.getReturns)
61+
f := status.NewRetryUpdateFunc(
62+
fakeGetter,
63+
fakeStatusUpdater,
64+
types.NamespacedName{},
65+
&v1beta1.GatewayClass{},
66+
zap.New(),
67+
func(client.Object) {})
68+
conditionPassed, err := f(context.Background())
69+
70+
// The function should always return nil.
71+
g.Expect(err).ToNot(HaveOccurred())
72+
g.Expect(conditionPassed).To(Equal(test.expConditionPassed))
73+
})
74+
}
75+
}

0 commit comments

Comments
 (0)