Skip to content

Commit 9d9c1f2

Browse files
pleshakovsjberman
andauthored
Add telemetry job (#1448)
Problem: We want to have a telemetry job that periodically reports product telemetry every 24h. For now, telemetry data is empty and report is sent to the debug log. Solution: - Refactor leader election to use controller-runtime manager capabilities. This simplifies the existing code and make it easier to add a telemetry Job. - Add a telemetry Job that periodically reports empty telemetry to the debug log. - Make the period configurable at build time via TELEMETRY_REPORT_PERIOD Makefile variable. Note: leader elector refactoring changes behavior of NGF process when leadership gets lost: Before: the Manager would shutdown waiting for the runnables to exit. After: the Manager doesn't wait. It similar to NGF process panicing. This should be OK, as NGF container will restart and recover any potentially broken state (update not fully populated statuses, restore correct NGINX configuration). Testing: - Unit tests - Manual testing: - Ensure leader election works as expected - both leader and non-pods run successfully. - Ensure NGF container exits when stop being leader. - Ensure an upgrade from Release 1.1.0 is successful for leader election - the leader gets elected among the new pods. - Ensure the telemetry Job reports telemetry multiple times, using a small value of ELEMETRY_REPORT_PERIOD CLOSES #1382 Co-authored-by: Saylor Berman <s.berman@f5.com>
1 parent ca6c2ff commit 9d9c1f2

File tree

18 files changed

+479
-128
lines changed

18 files changed

+479
-128
lines changed

.goreleaser.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ builds:
1414
- all=-trimpath={{.Env.GOPATH}}
1515
asmflags:
1616
- all=-trimpath={{.Env.GOPATH}}
17+
ldflags:
18+
- -s -w -X main.version={{.Version}} -X main.commit={{.Commit}} -X main.date={{.Date}} -X main.telemetryReportPeriod=24h
1719
main: ./cmd/gateway/
1820
binary: gateway
1921

.yamllint.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ rules:
4343
deploy/manifests/nginx-gateway.yaml
4444
deploy/manifests/crds
4545
tests/longevity/manifests/cronjob.yaml
46+
.goreleaser.yml
4647
new-line-at-end-of-file: enable
4748
new-lines: enable
4849
octal-values: disable

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,12 @@ NGINX_CONF_DIR = internal/mode/static/nginx/conf
88
NJS_DIR = internal/mode/static/nginx/modules/src
99
NGINX_DOCKER_BUILD_PLUS_ARGS = --secret id=nginx-repo.crt,src=nginx-repo.crt --secret id=nginx-repo.key,src=nginx-repo.key
1010
BUILD_AGENT=local
11+
TELEMETRY_REPORT_PERIOD = 24h # also configured in goreleaser.yml
1112
GW_API_VERSION = 1.0.0
1213
INSTALL_WEBHOOK = false
1314

1415
# go build flags - should not be overridden by the user
15-
GO_LINKER_FlAGS_VARS = -X main.version=${VERSION} -X main.commit=${GIT_COMMIT} -X main.date=${DATE}
16+
GO_LINKER_FlAGS_VARS = -X main.version=${VERSION} -X main.commit=${GIT_COMMIT} -X main.date=${DATE} -X main.telemetryReportPeriod=${TELEMETRY_REPORT_PERIOD}
1617
GO_LINKER_FLAGS_OPTIMIZATIONS = -s -w
1718
GO_LINKER_FLAGS = $(GO_LINKER_FLAGS_OPTIMIZATIONS) $(GO_LINKER_FlAGS_VARS)
1819

cmd/gateway/commands.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,11 @@ func createStaticModeCommand() *cobra.Command {
131131
return errors.New("POD_NAME environment variable must be set")
132132
}
133133

134+
period, err := time.ParseDuration(telemetryReportPeriod)
135+
if err != nil {
136+
return fmt.Errorf("error parsing telemetry report period: %w", err)
137+
}
138+
134139
var gwNsName *types.NamespacedName
135140
if cmd.Flags().Changed(gatewayFlag) {
136141
gwNsName = &gateway.value
@@ -163,7 +168,8 @@ func createStaticModeCommand() *cobra.Command {
163168
LockName: leaderElectionLockName.String(),
164169
Identity: podName,
165170
},
166-
Plus: plus,
171+
Plus: plus,
172+
TelemetryReportPeriod: period,
167173
}
168174

169175
if err := static.StartManager(conf); err != nil {

cmd/gateway/main.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,14 @@ import (
55
"os"
66
)
77

8+
// Set during go build
89
var (
9-
// Set during go build
1010
version string
1111
commit string
1212
date string
13+
14+
// telemetryReportPeriod is the period at which telemetry reports are sent.
15+
telemetryReportPeriod string
1316
)
1417

1518
func main() {

internal/framework/runnables/doc.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
/*
2+
Package runnables provides helper types for creating runnables for the controller-runtime manager when
3+
leader election is enabled.
4+
*/
5+
package runnables
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
package runnables
2+
3+
import (
4+
"context"
5+
6+
"sigs.k8s.io/controller-runtime/pkg/manager"
7+
)
8+
9+
// Leader is a Runnable that needs to be run only when the current instance is the leader.
10+
type Leader struct {
11+
manager.Runnable
12+
}
13+
14+
var (
15+
_ manager.LeaderElectionRunnable = &Leader{}
16+
_ manager.Runnable = &Leader{}
17+
)
18+
19+
func (r *Leader) NeedLeaderElection() bool {
20+
return true
21+
}
22+
23+
// LeaderOrNonLeader is a Runnable that needs to be run regardless of whether the current instance is the leader.
24+
type LeaderOrNonLeader struct {
25+
manager.Runnable
26+
}
27+
28+
var (
29+
_ manager.LeaderElectionRunnable = &LeaderOrNonLeader{}
30+
_ manager.Runnable = &LeaderOrNonLeader{}
31+
)
32+
33+
func (r *LeaderOrNonLeader) NeedLeaderElection() bool {
34+
return false
35+
}
36+
37+
// EnableAfterBecameLeader is a Runnable that will call the enable function when the current instance becomes
38+
// the leader.
39+
type EnableAfterBecameLeader struct {
40+
enable func(context.Context)
41+
}
42+
43+
var (
44+
_ manager.LeaderElectionRunnable = &EnableAfterBecameLeader{}
45+
_ manager.Runnable = &EnableAfterBecameLeader{}
46+
)
47+
48+
// NewEnableAfterBecameLeader creates a new EnableAfterBecameLeader Runnable.
49+
func NewEnableAfterBecameLeader(enable func(context.Context)) *EnableAfterBecameLeader {
50+
return &EnableAfterBecameLeader{
51+
enable: enable,
52+
}
53+
}
54+
55+
func (j *EnableAfterBecameLeader) Start(ctx context.Context) error {
56+
j.enable(ctx)
57+
return nil
58+
}
59+
60+
func (j *EnableAfterBecameLeader) NeedLeaderElection() bool {
61+
return true
62+
}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
package runnables
2+
3+
import (
4+
"context"
5+
"testing"
6+
7+
. "github.com/onsi/gomega"
8+
)
9+
10+
func TestLeader(t *testing.T) {
11+
leader := &Leader{}
12+
13+
g := NewWithT(t)
14+
g.Expect(leader.NeedLeaderElection()).To(BeTrue())
15+
}
16+
17+
func TestLeaderOrNonLeader(t *testing.T) {
18+
leaderOrNonLeader := &LeaderOrNonLeader{}
19+
20+
g := NewWithT(t)
21+
g.Expect(leaderOrNonLeader.NeedLeaderElection()).To(BeFalse())
22+
}
23+
24+
func TestEnableAfterBecameLeader(t *testing.T) {
25+
enabled := false
26+
enableAfterBecameLeader := NewEnableAfterBecameLeader(func(_ context.Context) {
27+
enabled = true
28+
})
29+
30+
g := NewWithT(t)
31+
g.Expect(enableAfterBecameLeader.NeedLeaderElection()).To(BeTrue())
32+
g.Expect(enabled).To(BeFalse())
33+
34+
err := enableAfterBecameLeader.Start(context.Background())
35+
g.Expect(err).To(BeNil())
36+
37+
g.Expect(enabled).To(BeTrue())
38+
}

internal/mode/static/config/config.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package config
22

33
import (
4+
"time"
5+
46
"github.com/go-logr/logr"
57
"go.uber.org/zap"
68
"k8s.io/apimachinery/pkg/types"
@@ -32,6 +34,8 @@ type Config struct {
3234
MetricsConfig MetricsConfig
3335
// HealthConfig specifies the health probe config.
3436
HealthConfig HealthConfig
37+
// TelemetryReportPeriod is the period at which telemetry reports are sent.
38+
TelemetryReportPeriod time.Duration
3539
}
3640

3741
// GatewayPodConfig contains information about this Pod.

internal/mode/static/leader.go

Lines changed: 0 additions & 102 deletions
This file was deleted.

internal/mode/static/manager.go

Lines changed: 33 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"k8s.io/client-go/tools/record"
2121
ctlr "sigs.k8s.io/controller-runtime"
2222
"sigs.k8s.io/controller-runtime/pkg/client"
23+
ctrlcfg "sigs.k8s.io/controller-runtime/pkg/config"
2324
"sigs.k8s.io/controller-runtime/pkg/manager"
2425
"sigs.k8s.io/controller-runtime/pkg/metrics"
2526
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
@@ -34,6 +35,8 @@ import (
3435
"github.com/nginxinc/nginx-gateway-fabric/internal/framework/controller/predicate"
3536
"github.com/nginxinc/nginx-gateway-fabric/internal/framework/events"
3637
"github.com/nginxinc/nginx-gateway-fabric/internal/framework/gatewayclass"
38+
"github.com/nginxinc/nginx-gateway-fabric/internal/framework/helpers"
39+
"github.com/nginxinc/nginx-gateway-fabric/internal/framework/runnables"
3740
"github.com/nginxinc/nginx-gateway-fabric/internal/framework/status"
3841
"github.com/nginxinc/nginx-gateway-fabric/internal/mode/static/config"
3942
"github.com/nginxinc/nginx-gateway-fabric/internal/mode/static/metrics/collectors"
@@ -45,6 +48,7 @@ import (
4548
"github.com/nginxinc/nginx-gateway-fabric/internal/mode/static/state/relationship"
4649
"github.com/nginxinc/nginx-gateway-fabric/internal/mode/static/state/resolver"
4750
"github.com/nginxinc/nginx-gateway-fabric/internal/mode/static/state/validation"
51+
"github.com/nginxinc/nginx-gateway-fabric/internal/mode/static/telemetry"
4852
)
4953

5054
const (
@@ -69,6 +73,21 @@ func StartManager(cfg config.Config) error {
6973
Scheme: scheme,
7074
Logger: cfg.Logger,
7175
Metrics: getMetricsOptions(cfg.MetricsConfig),
76+
// Note: when the leadership is lost, the manager will return an error in the Start() method.
77+
// However, it will not wait for any Runnable it starts to finish, meaning any in-progress operations
78+
// might get terminated half-way.
79+
LeaderElection: true,
80+
LeaderElectionNamespace: cfg.GatewayPodConfig.Namespace,
81+
LeaderElectionID: cfg.LeaderElection.LockName,
82+
// We're not enabling LeaderElectionReleaseOnCancel because when the Manager stops gracefully, it waits
83+
// for all started Runnables (including Leader-only ones) to finish. Otherwise, the new leader might start
84+
// running Leader-only Runnables before the old leader has finished running them.
85+
// See the doc comment for the LeaderElectionReleaseOnCancel for more details.
86+
LeaderElectionReleaseOnCancel: false,
87+
Controller: ctrlcfg.Controller{
88+
// All of our controllers still need to work in case of non-leader pods
89+
NeedLeaderElection: helpers.GetPointer(false),
90+
},
7291
}
7392

7493
if cfg.HealthConfig.Enabled {
@@ -211,35 +230,26 @@ func StartManager(cfg config.Config) error {
211230
firstBatchPreparer,
212231
)
213232

214-
if err = mgr.Add(eventLoop); err != nil {
233+
if err = mgr.Add(&runnables.LeaderOrNonLeader{Runnable: eventLoop}); err != nil {
215234
return fmt.Errorf("cannot register event loop: %w", err)
216235
}
217236

218-
leaderElectorLogger := cfg.Logger.WithName("leaderElector")
237+
if err = mgr.Add(runnables.NewEnableAfterBecameLeader(statusUpdater.Enable)); err != nil {
238+
return fmt.Errorf("cannot register status updater: %w", err)
239+
}
219240

220-
if cfg.LeaderElection.Enabled {
221-
leaderElector, err := newLeaderElectorRunnable(leaderElectorRunnableConfig{
222-
kubeConfig: clusterCfg,
223-
recorder: recorder,
224-
onStartedLeading: func(ctx context.Context) {
225-
leaderElectorLogger.Info("Started leading")
226-
statusUpdater.Enable(ctx)
241+
telemetryJob := &runnables.Leader{
242+
Runnable: telemetry.NewJob(
243+
telemetry.JobConfig{
244+
Exporter: telemetry.NewLoggingExporter(cfg.Logger.WithName("telemetryExporter").V(1 /* debug */)),
245+
Logger: cfg.Logger.WithName("telemetryJob"),
246+
Period: cfg.TelemetryReportPeriod,
227247
},
228-
onStoppedLeading: func() {
229-
leaderElectorLogger.Info("Stopped leading")
230-
statusUpdater.Disable()
231-
},
232-
lockNs: cfg.GatewayPodConfig.Namespace,
233-
lockName: cfg.LeaderElection.LockName,
234-
identity: cfg.LeaderElection.Identity,
235-
})
236-
if err != nil {
237-
return err
238-
}
248+
),
249+
}
239250

240-
if err = mgr.Add(leaderElector); err != nil {
241-
return fmt.Errorf("cannot register leader elector: %w", err)
242-
}
251+
if err = mgr.Add(telemetryJob); err != nil {
252+
return fmt.Errorf("cannot register telemetry job: %w", err)
243253
}
244254

245255
cfg.Logger.Info("Starting manager")

internal/mode/static/telemetry/doc.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
/*
2+
Package telemetry is responsible for collecting and sending product telemetry data.
3+
*/
4+
package telemetry

0 commit comments

Comments
 (0)