From 8f2a11347dbae60290671ae4a037a5730cbb89f0 Mon Sep 17 00:00:00 2001 From: Ciara Stacke Date: Tue, 12 Sep 2023 12:36:41 +0100 Subject: [PATCH 1/3] Add NGINX reload counters --- docs/monitoring.md | 8 ++ internal/mode/static/manager.go | 42 +++--- internal/mode/static/metrics/collector.go | 121 ++++++++++++++++++ internal/mode/static/metrics/metrics.go | 4 + internal/mode/static/metrics/nginx.go | 2 +- internal/mode/static/nginx/runtime/manager.go | 25 +++- 6 files changed, 182 insertions(+), 20 deletions(-) create mode 100644 internal/mode/static/metrics/collector.go create mode 100644 internal/mode/static/metrics/metrics.go diff --git a/docs/monitoring.md b/docs/monitoring.md index 5e5862157d..238444148f 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -86,6 +86,14 @@ NGINX Kubernetes Gateway exports the following metrics: - These metrics have the namespace `nginx_kubernetes_gateway`, and include the label `class` which is set to the Gateway class of NKG. For example, `nginx_kubernetes_gateway_connections_accepted{class="nginx"}`. +- NGINX Kubernetes Gateway metrics: + - nginx_reloads_total. Number of successful NGINX reloads. + - nginx_reload_errors_total. Number of unsuccessful NGINX reloads. + - nginx_last_reload_status. Status of the last NGINX reload, 0 meaning down and 1 up. + - nginx_last_reload_milliseconds. Duration in milliseconds of the last NGINX reload. + - These metrics have the namespace `nginx_kubernetes_gateway`, and include the label `class` which is set to the + Gateway class of NKG. For example, `nginx_kubernetes_gateway_nginx_reloads_total{class="nginx"}`. + - [controller-runtime](https://github.com/kubernetes-sigs/controller-runtime) metrics. These include: - Total number of reconciliation errors per controller - Length of reconcile queue per controller diff --git a/internal/mode/static/manager.go b/internal/mode/static/manager.go index a140f581d9..626c961866 100644 --- a/internal/mode/static/manager.go +++ b/internal/mode/static/manager.go @@ -125,6 +125,20 @@ func StartManager(cfg config.Config) error { return fmt.Errorf("cannot clear NGINX configuration folders: %w", err) } + // Ensure NGINX is running before registering metrics & starting the manager. + if err := ngxruntime.EnsureNginxRunning(ctx); err != nil { + return fmt.Errorf("NGINX is not running: %w", err) + } + + var mgrCollector nkgmetrics.ManagerCollector + mgrCollector = nkgmetrics.NewManagerFakeCollector() + if cfg.MetricsConfig.Enabled { + mgrCollector, err = configureNginxMetrics(cfg.GatewayClassName) + if err != nil { + return err + } + } + statusUpdater := status.NewUpdater(status.UpdaterConfig{ GatewayCtlrName: cfg.GatewayCtlrName, GatewayClassName: cfg.GatewayClassName, @@ -146,7 +160,7 @@ func StartManager(cfg config.Config) error { cfg.Logger.WithName("nginxFileManager"), file.NewStdLibOSFileManager(), ), - nginxRuntimeMgr: ngxruntime.NewManagerImpl(), + nginxRuntimeMgr: ngxruntime.NewManagerImpl(mgrCollector), statusUpdater: statusUpdater, eventRecorder: recorder, healthChecker: hc, @@ -193,17 +207,6 @@ func StartManager(cfg config.Config) error { } } - // Ensure NGINX is running before registering metrics & starting the manager. - if err := ngxruntime.EnsureNginxRunning(ctx); err != nil { - return fmt.Errorf("NGINX is not running: %w", err) - } - - if cfg.MetricsConfig.Enabled { - if err := configureNginxMetrics(cfg.GatewayClassName); err != nil { - return err - } - } - cfg.Logger.Info("Starting manager") return mgr.Start(ctx) } @@ -353,13 +356,22 @@ func setInitialConfig( return updateControlPlane(&config, logger, eventRecorder, configName, logLevelSetter) } -func configureNginxMetrics(gatewayClassName string) error { +func configureNginxMetrics(gatewayClassName string) (*nkgmetrics.ManagerMetricsCollector, error) { constLabels := map[string]string{"class": gatewayClassName} ngxCollector, err := nkgmetrics.NewNginxMetricsCollector(constLabels) if err != nil { - return fmt.Errorf("cannot get NGINX metrics: %w", err) + return nil, fmt.Errorf("cannot get NGINX metrics: %w", err) + } + mgrCollector := nkgmetrics.NewManagerMetricsCollector(constLabels) + if err = metrics.Registry.Register(mgrCollector); err != nil { + return nil, fmt.Errorf("failed to register manager metrics collector: %w", err) } - return metrics.Registry.Register(ngxCollector) + + if err = metrics.Registry.Register(ngxCollector); err != nil { + return nil, fmt.Errorf("failed to register NGINX metrics collector: %w", err) + } + + return mgrCollector, nil } func getMetricsOptions(cfg config.MetricsConfig) metricsserver.Options { diff --git a/internal/mode/static/metrics/collector.go b/internal/mode/static/metrics/collector.go new file mode 100644 index 0000000000..35d16dd419 --- /dev/null +++ b/internal/mode/static/metrics/collector.go @@ -0,0 +1,121 @@ +package metrics + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" +) + +// ManagerCollector is an interface for the metrics of the Nginx runtime manager +type ManagerCollector interface { + IncNginxReloadCount() + IncNginxReloadErrors() + UpdateLastReloadTime(ms time.Duration) +} + +// ManagerMetricsCollector implements ManagerCollector interface and prometheus.Collector interface +type ManagerMetricsCollector struct { + // Metrics + reloadsTotal prometheus.Counter + reloadsError prometheus.Counter + lastReloadStatus prometheus.Gauge + lastReloadTime prometheus.Gauge +} + +// NewManagerMetricsCollector creates a new ManagerMetricsCollector +func NewManagerMetricsCollector(constLabels map[string]string) *ManagerMetricsCollector { + nc := &ManagerMetricsCollector{ + reloadsTotal: prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "nginx_reloads_total", + Namespace: metricsNamespace, + Help: "Number of successful NGINX reloads", + ConstLabels: constLabels, + }), + reloadsError: prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "nginx_reload_errors_total", + Namespace: metricsNamespace, + Help: "Number of unsuccessful NGINX reloads", + ConstLabels: constLabels, + }, + ), + lastReloadStatus: prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "nginx_last_reload_status", + Namespace: metricsNamespace, + Help: "Status of the last NGINX reload", + ConstLabels: constLabels, + }, + ), + lastReloadTime: prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "nginx_last_reload_milliseconds", + Namespace: metricsNamespace, + Help: "Duration in milliseconds of the last NGINX reload", + ConstLabels: constLabels, + }, + ), + } + return nc +} + +// IncNginxReloadCount increments the counter of successful NGINX reloads and sets the last reload status to true. +func (mc *ManagerMetricsCollector) IncNginxReloadCount() { + mc.reloadsTotal.Inc() + mc.updateLastReloadStatus(true) +} + +// IncNginxReloadErrors increments the counter of NGINX reload errors and sets the last reload status to false. +func (mc *ManagerMetricsCollector) IncNginxReloadErrors() { + mc.reloadsError.Inc() + mc.updateLastReloadStatus(false) +} + +// updateLastReloadStatus updates the last NGINX reload status metric. +func (mc *ManagerMetricsCollector) updateLastReloadStatus(up bool) { + var status float64 + if up { + status = 1.0 + } + mc.lastReloadStatus.Set(status) +} + +// UpdateLastReloadTime updates the last NGINX reload time. +func (mc *ManagerMetricsCollector) UpdateLastReloadTime(duration time.Duration) { + mc.lastReloadTime.Set(float64(duration / time.Millisecond)) +} + +// Describe implements prometheus.Collector interface Describe method. +func (mc *ManagerMetricsCollector) Describe(ch chan<- *prometheus.Desc) { + mc.reloadsTotal.Describe(ch) + mc.reloadsError.Describe(ch) + mc.lastReloadStatus.Describe(ch) + mc.lastReloadTime.Describe(ch) +} + +// Collect implements the prometheus.Collector interface Collect method. +func (mc *ManagerMetricsCollector) Collect(ch chan<- prometheus.Metric) { + mc.reloadsTotal.Collect(ch) + mc.reloadsError.Collect(ch) + mc.lastReloadStatus.Collect(ch) + mc.lastReloadTime.Collect(ch) +} + +// ManagerFakeCollector is a fake collector that will implement ManagerCollector interface. +// Used to initilise the ManagerCollector when metrics are disabled to avoid nil pointer errors. +type ManagerFakeCollector struct{} + +// NewManagerFakeCollector creates a fake collector that implements ManagerCollector interface. +func NewManagerFakeCollector() *ManagerFakeCollector { + return &ManagerFakeCollector{} +} + +// IncNginxReloadCount implements a fake IncNginxReloadCount. +func (mc *ManagerFakeCollector) IncNginxReloadCount() {} + +// IncNginxReloadErrors implements a fake IncNginxReloadErrors. +func (mc *ManagerFakeCollector) IncNginxReloadErrors() {} + +// UpdateLastReloadTime implements a fake UpdateLastReloadTime. +func (mc *ManagerFakeCollector) UpdateLastReloadTime(_ time.Duration) {} diff --git a/internal/mode/static/metrics/metrics.go b/internal/mode/static/metrics/metrics.go new file mode 100644 index 0000000000..1d9e85e7e6 --- /dev/null +++ b/internal/mode/static/metrics/metrics.go @@ -0,0 +1,4 @@ +package metrics + +// nolint:gosec // flagged as potential hardcoded credentials, but is not sensitive +const metricsNamespace = "nginx_kubernetes_gateway" diff --git a/internal/mode/static/metrics/nginx.go b/internal/mode/static/metrics/nginx.go index 33946722d2..c2eadd265f 100644 --- a/internal/mode/static/metrics/nginx.go +++ b/internal/mode/static/metrics/nginx.go @@ -24,7 +24,7 @@ func NewNginxMetricsCollector(constLabels map[string]string) (prometheus.Collect if err != nil { return nil, err } - return nginxCollector.NewNginxCollector(client, "nginx_kubernetes_gateway", constLabels), nil + return nginxCollector.NewNginxCollector(client, metricsNamespace, constLabels), nil } // getSocketClient gets an http.Client with a unix socket transport. diff --git a/internal/mode/static/nginx/runtime/manager.go b/internal/mode/static/nginx/runtime/manager.go index eb7f48c208..641d572e07 100644 --- a/internal/mode/static/nginx/runtime/manager.go +++ b/internal/mode/static/nginx/runtime/manager.go @@ -13,6 +13,8 @@ import ( "time" "k8s.io/apimachinery/pkg/util/wait" + + nkgmetrics "github.com/nginxinc/nginx-kubernetes-gateway/internal/mode/static/metrics" ) const ( @@ -43,32 +45,38 @@ type Manager interface { // ManagerImpl implements Manager. type ManagerImpl struct { - verifyClient *verifyClient + verifyClient *verifyClient + metricsCollector nkgmetrics.ManagerCollector } // NewManagerImpl creates a new ManagerImpl. -func NewManagerImpl() *ManagerImpl { +func NewManagerImpl(mgrCollector nkgmetrics.ManagerCollector) *ManagerImpl { return &ManagerImpl{ - verifyClient: newVerifyClient(nginxReloadTimeout), + verifyClient: newVerifyClient(nginxReloadTimeout), + metricsCollector: mgrCollector, } } func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error { + t1 := time.Now() // We find the main NGINX PID on every reload because it will change if the NGINX container is restarted. pid, err := findMainProcess(ctx, os.Stat, os.ReadFile, pidFileTimeout) if err != nil { + m.metricsCollector.IncNginxReloadErrors() return fmt.Errorf("failed to find NGINX main process: %w", err) } childProcFile := fmt.Sprintf(childProcPathFmt, pid) previousChildProcesses, err := os.ReadFile(childProcFile) if err != nil { + m.metricsCollector.IncNginxReloadErrors() return err } // send HUP signal to the NGINX main process reload configuration // See https://nginx.org/en/docs/control.html if err := syscall.Kill(pid, syscall.SIGHUP); err != nil { + m.metricsCollector.IncNginxReloadErrors() return fmt.Errorf("failed to send the HUP signal to NGINX main: %w", err) } @@ -79,10 +87,19 @@ func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error { os.ReadFile, childProcsTimeout, ); err != nil { + m.metricsCollector.IncNginxReloadErrors() return fmt.Errorf(noNewWorkersErrFmt, configVersion, err) } - return m.verifyClient.waitForCorrectVersion(ctx, configVersion) + if err = m.verifyClient.waitForCorrectVersion(ctx, configVersion); err != nil { + m.metricsCollector.IncNginxReloadErrors() + return err + } + m.metricsCollector.IncNginxReloadCount() + + t2 := time.Now() + m.metricsCollector.UpdateLastReloadTime(t2.Sub(t1)) + return nil } // EnsureNginxRunning ensures NGINX is running by locating the main process. From a34977acf121cc45395497d11eed4d4eb042d768 Mon Sep 17 00:00:00 2001 From: Ciara Stacke Date: Fri, 15 Sep 2023 09:48:35 +0100 Subject: [PATCH 2/3] Move interface, change metric names and types --- docs/monitoring.md | 5 +- internal/mode/static/manager.go | 2 +- internal/mode/static/metrics/collector.go | 58 +++++++++---------- internal/mode/static/nginx/runtime/manager.go | 15 +++-- 4 files changed, 39 insertions(+), 41 deletions(-) diff --git a/docs/monitoring.md b/docs/monitoring.md index 238444148f..261e7b06a5 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -89,8 +89,9 @@ NGINX Kubernetes Gateway exports the following metrics: - NGINX Kubernetes Gateway metrics: - nginx_reloads_total. Number of successful NGINX reloads. - nginx_reload_errors_total. Number of unsuccessful NGINX reloads. - - nginx_last_reload_status. Status of the last NGINX reload, 0 meaning down and 1 up. - - nginx_last_reload_milliseconds. Duration in milliseconds of the last NGINX reload. + - nginx_stale_config. 1 means NKG failed to configure NGINX with the latest version of the configuration, which means + NGINX is running with a stale version. + - nginx_last_reload_milliseconds. Duration in milliseconds of NGINX reloads. - These metrics have the namespace `nginx_kubernetes_gateway`, and include the label `class` which is set to the Gateway class of NKG. For example, `nginx_kubernetes_gateway_nginx_reloads_total{class="nginx"}`. diff --git a/internal/mode/static/manager.go b/internal/mode/static/manager.go index 626c961866..3876a40823 100644 --- a/internal/mode/static/manager.go +++ b/internal/mode/static/manager.go @@ -130,7 +130,7 @@ func StartManager(cfg config.Config) error { return fmt.Errorf("NGINX is not running: %w", err) } - var mgrCollector nkgmetrics.ManagerCollector + var mgrCollector ngxruntime.ManagerCollector mgrCollector = nkgmetrics.NewManagerFakeCollector() if cfg.MetricsConfig.Enabled { mgrCollector, err = configureNginxMetrics(cfg.GatewayClassName) diff --git a/internal/mode/static/metrics/collector.go b/internal/mode/static/metrics/collector.go index 35d16dd419..6ecd171d3b 100644 --- a/internal/mode/static/metrics/collector.go +++ b/internal/mode/static/metrics/collector.go @@ -6,20 +6,13 @@ import ( "github.com/prometheus/client_golang/prometheus" ) -// ManagerCollector is an interface for the metrics of the Nginx runtime manager -type ManagerCollector interface { - IncNginxReloadCount() - IncNginxReloadErrors() - UpdateLastReloadTime(ms time.Duration) -} - // ManagerMetricsCollector implements ManagerCollector interface and prometheus.Collector interface type ManagerMetricsCollector struct { // Metrics - reloadsTotal prometheus.Counter - reloadsError prometheus.Counter - lastReloadStatus prometheus.Gauge - lastReloadTime prometheus.Gauge + reloadsTotal prometheus.Counter + reloadsError prometheus.Counter + configStale prometheus.Gauge + reloadsDuration prometheus.Histogram } // NewManagerMetricsCollector creates a new ManagerMetricsCollector @@ -40,70 +33,71 @@ func NewManagerMetricsCollector(constLabels map[string]string) *ManagerMetricsCo ConstLabels: constLabels, }, ), - lastReloadStatus: prometheus.NewGauge( + configStale: prometheus.NewGauge( prometheus.GaugeOpts{ - Name: "nginx_last_reload_status", + Name: "nginx_stale_config", Namespace: metricsNamespace, - Help: "Status of the last NGINX reload", + Help: "Indicates if NGINX is not serving the latest configuration.", ConstLabels: constLabels, }, ), - lastReloadTime: prometheus.NewGauge( - prometheus.GaugeOpts{ - Name: "nginx_last_reload_milliseconds", + reloadsDuration: prometheus.NewHistogram( + prometheus.HistogramOpts{ + Name: "nginx_reloads_milliseconds", Namespace: metricsNamespace, - Help: "Duration in milliseconds of the last NGINX reload", + Help: "Duration in milliseconds of NGINX reloads", ConstLabels: constLabels, + Buckets: []float64{100.0, 200.0, 300.0, 400.0, 500.0}, }, ), } return nc } -// IncNginxReloadCount increments the counter of successful NGINX reloads and sets the last reload status to true. +// IncNginxReloadCount increments the counter of successful NGINX reloads and sets the stale config status to false. func (mc *ManagerMetricsCollector) IncNginxReloadCount() { mc.reloadsTotal.Inc() - mc.updateLastReloadStatus(true) + mc.updateConfigStaleStatus(false) } -// IncNginxReloadErrors increments the counter of NGINX reload errors and sets the last reload status to false. +// IncNginxReloadErrors increments the counter of NGINX reload errors and sets the stale config status to true. func (mc *ManagerMetricsCollector) IncNginxReloadErrors() { mc.reloadsError.Inc() - mc.updateLastReloadStatus(false) + mc.updateConfigStaleStatus(true) } -// updateLastReloadStatus updates the last NGINX reload status metric. -func (mc *ManagerMetricsCollector) updateLastReloadStatus(up bool) { +// updateConfigStaleStatus updates the last NGINX reload status metric. +func (mc *ManagerMetricsCollector) updateConfigStaleStatus(stale bool) { var status float64 - if up { + if stale { status = 1.0 } - mc.lastReloadStatus.Set(status) + mc.configStale.Set(status) } // UpdateLastReloadTime updates the last NGINX reload time. func (mc *ManagerMetricsCollector) UpdateLastReloadTime(duration time.Duration) { - mc.lastReloadTime.Set(float64(duration / time.Millisecond)) + mc.reloadsDuration.Observe(float64(duration / time.Millisecond)) } // Describe implements prometheus.Collector interface Describe method. func (mc *ManagerMetricsCollector) Describe(ch chan<- *prometheus.Desc) { mc.reloadsTotal.Describe(ch) mc.reloadsError.Describe(ch) - mc.lastReloadStatus.Describe(ch) - mc.lastReloadTime.Describe(ch) + mc.configStale.Describe(ch) + mc.reloadsDuration.Describe(ch) } // Collect implements the prometheus.Collector interface Collect method. func (mc *ManagerMetricsCollector) Collect(ch chan<- prometheus.Metric) { mc.reloadsTotal.Collect(ch) mc.reloadsError.Collect(ch) - mc.lastReloadStatus.Collect(ch) - mc.lastReloadTime.Collect(ch) + mc.configStale.Collect(ch) + mc.reloadsDuration.Collect(ch) } // ManagerFakeCollector is a fake collector that will implement ManagerCollector interface. -// Used to initilise the ManagerCollector when metrics are disabled to avoid nil pointer errors. +// Used to initialize the ManagerCollector when metrics are disabled to avoid nil pointer errors. type ManagerFakeCollector struct{} // NewManagerFakeCollector creates a fake collector that implements ManagerCollector interface. diff --git a/internal/mode/static/nginx/runtime/manager.go b/internal/mode/static/nginx/runtime/manager.go index 641d572e07..377c9e3f86 100644 --- a/internal/mode/static/nginx/runtime/manager.go +++ b/internal/mode/static/nginx/runtime/manager.go @@ -13,8 +13,6 @@ import ( "time" "k8s.io/apimachinery/pkg/util/wait" - - nkgmetrics "github.com/nginxinc/nginx-kubernetes-gateway/internal/mode/static/metrics" ) const ( @@ -43,14 +41,21 @@ type Manager interface { Reload(ctx context.Context, configVersion int) error } +// ManagerCollector is an interface for the metrics of the NGINX runtime manager. +type ManagerCollector interface { + IncNginxReloadCount() + IncNginxReloadErrors() + UpdateLastReloadTime(ms time.Duration) +} + // ManagerImpl implements Manager. type ManagerImpl struct { verifyClient *verifyClient - metricsCollector nkgmetrics.ManagerCollector + metricsCollector ManagerCollector } // NewManagerImpl creates a new ManagerImpl. -func NewManagerImpl(mgrCollector nkgmetrics.ManagerCollector) *ManagerImpl { +func NewManagerImpl(mgrCollector ManagerCollector) *ManagerImpl { return &ManagerImpl{ verifyClient: newVerifyClient(nginxReloadTimeout), metricsCollector: mgrCollector, @@ -62,14 +67,12 @@ func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error { // We find the main NGINX PID on every reload because it will change if the NGINX container is restarted. pid, err := findMainProcess(ctx, os.Stat, os.ReadFile, pidFileTimeout) if err != nil { - m.metricsCollector.IncNginxReloadErrors() return fmt.Errorf("failed to find NGINX main process: %w", err) } childProcFile := fmt.Sprintf(childProcPathFmt, pid) previousChildProcesses, err := os.ReadFile(childProcFile) if err != nil { - m.metricsCollector.IncNginxReloadErrors() return err } From 46fc72f2b57d6c3013b3beae07dfce5728e23c3f Mon Sep 17 00:00:00 2001 From: Ciara Stacke Date: Tue, 19 Sep 2023 13:54:15 +0100 Subject: [PATCH 3/3] Renaming and other review feedback --- docs/monitoring.md | 2 +- internal/mode/static/manager.go | 33 ++++++++++--------- internal/mode/static/metrics/collector.go | 32 +++++++++--------- internal/mode/static/nginx/runtime/manager.go | 26 +++++++-------- 4 files changed, 48 insertions(+), 45 deletions(-) diff --git a/docs/monitoring.md b/docs/monitoring.md index 261e7b06a5..1a47141a7c 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -91,7 +91,7 @@ NGINX Kubernetes Gateway exports the following metrics: - nginx_reload_errors_total. Number of unsuccessful NGINX reloads. - nginx_stale_config. 1 means NKG failed to configure NGINX with the latest version of the configuration, which means NGINX is running with a stale version. - - nginx_last_reload_milliseconds. Duration in milliseconds of NGINX reloads. + - nginx_last_reload_milliseconds. Duration in milliseconds of NGINX reloads (histogram). - These metrics have the namespace `nginx_kubernetes_gateway`, and include the label `class` which is set to the Gateway class of NKG. For example, `nginx_kubernetes_gateway_nginx_reloads_total{class="nginx"}`. diff --git a/internal/mode/static/manager.go b/internal/mode/static/manager.go index 3876a40823..e84bc1404f 100644 --- a/internal/mode/static/manager.go +++ b/internal/mode/static/manager.go @@ -130,13 +130,9 @@ func StartManager(cfg config.Config) error { return fmt.Errorf("NGINX is not running: %w", err) } - var mgrCollector ngxruntime.ManagerCollector - mgrCollector = nkgmetrics.NewManagerFakeCollector() - if cfg.MetricsConfig.Enabled { - mgrCollector, err = configureNginxMetrics(cfg.GatewayClassName) - if err != nil { - return err - } + mgrCollector, err := createAndRegisterMetricsCollectors(cfg.MetricsConfig.Enabled, cfg.GatewayClassName) + if err != nil { + return fmt.Errorf("cannot create and register metrics collectors: %w", err) } statusUpdater := status.NewUpdater(status.UpdaterConfig{ @@ -356,19 +352,26 @@ func setInitialConfig( return updateControlPlane(&config, logger, eventRecorder, configName, logLevelSetter) } -func configureNginxMetrics(gatewayClassName string) (*nkgmetrics.ManagerMetricsCollector, error) { - constLabels := map[string]string{"class": gatewayClassName} +// createAndRegisterMetricsCollectors creates the NGINX status and NGINX runtime manager collectors, registers them, +// and returns the runtime manager collector to be used in the nginxRuntimeMgr. +func createAndRegisterMetricsCollectors(metricsEnabled bool, gwClassName string) (ngxruntime.ManagerCollector, error) { + if !metricsEnabled { + // return a no-op collector to avoid nil pointer errors when metrics are disabled + return nkgmetrics.NewManagerNoopCollector(), nil + } + constLabels := map[string]string{"class": gwClassName} + ngxCollector, err := nkgmetrics.NewNginxMetricsCollector(constLabels) if err != nil { - return nil, fmt.Errorf("cannot get NGINX metrics: %w", err) + return nil, fmt.Errorf("cannot create NGINX status metrics collector: %w", err) } - mgrCollector := nkgmetrics.NewManagerMetricsCollector(constLabels) - if err = metrics.Registry.Register(mgrCollector); err != nil { - return nil, fmt.Errorf("failed to register manager metrics collector: %w", err) + if err := metrics.Registry.Register(ngxCollector); err != nil { + return nil, fmt.Errorf("failed to register NGINX status metrics collector: %w", err) } - if err = metrics.Registry.Register(ngxCollector); err != nil { - return nil, fmt.Errorf("failed to register NGINX metrics collector: %w", err) + mgrCollector := nkgmetrics.NewManagerMetricsCollector(constLabels) + if err := metrics.Registry.Register(mgrCollector); err != nil { + return nil, fmt.Errorf("failed to register NGINX manager runtime metrics collector: %w", err) } return mgrCollector, nil diff --git a/internal/mode/static/metrics/collector.go b/internal/mode/static/metrics/collector.go index 6ecd171d3b..daa454ed85 100644 --- a/internal/mode/static/metrics/collector.go +++ b/internal/mode/static/metrics/collector.go @@ -47,7 +47,7 @@ func NewManagerMetricsCollector(constLabels map[string]string) *ManagerMetricsCo Namespace: metricsNamespace, Help: "Duration in milliseconds of NGINX reloads", ConstLabels: constLabels, - Buckets: []float64{100.0, 200.0, 300.0, 400.0, 500.0}, + Buckets: []float64{500, 1000, 5000, 10000, 30000}, }, ), } @@ -55,13 +55,13 @@ func NewManagerMetricsCollector(constLabels map[string]string) *ManagerMetricsCo } // IncNginxReloadCount increments the counter of successful NGINX reloads and sets the stale config status to false. -func (mc *ManagerMetricsCollector) IncNginxReloadCount() { +func (mc *ManagerMetricsCollector) IncReloadCount() { mc.reloadsTotal.Inc() mc.updateConfigStaleStatus(false) } // IncNginxReloadErrors increments the counter of NGINX reload errors and sets the stale config status to true. -func (mc *ManagerMetricsCollector) IncNginxReloadErrors() { +func (mc *ManagerMetricsCollector) IncReloadErrors() { mc.reloadsError.Inc() mc.updateConfigStaleStatus(true) } @@ -75,8 +75,8 @@ func (mc *ManagerMetricsCollector) updateConfigStaleStatus(stale bool) { mc.configStale.Set(status) } -// UpdateLastReloadTime updates the last NGINX reload time. -func (mc *ManagerMetricsCollector) UpdateLastReloadTime(duration time.Duration) { +// ObserveLastReloadTime adds the last NGINX reload time to the histogram. +func (mc *ManagerMetricsCollector) ObserveLastReloadTime(duration time.Duration) { mc.reloadsDuration.Observe(float64(duration / time.Millisecond)) } @@ -96,20 +96,20 @@ func (mc *ManagerMetricsCollector) Collect(ch chan<- prometheus.Metric) { mc.reloadsDuration.Collect(ch) } -// ManagerFakeCollector is a fake collector that will implement ManagerCollector interface. +// ManagerNoopCollector is a no-op collector that will implement ManagerCollector interface. // Used to initialize the ManagerCollector when metrics are disabled to avoid nil pointer errors. -type ManagerFakeCollector struct{} +type ManagerNoopCollector struct{} -// NewManagerFakeCollector creates a fake collector that implements ManagerCollector interface. -func NewManagerFakeCollector() *ManagerFakeCollector { - return &ManagerFakeCollector{} +// NewManagerNoopCollector creates a no-op collector that implements ManagerCollector interface. +func NewManagerNoopCollector() *ManagerNoopCollector { + return &ManagerNoopCollector{} } -// IncNginxReloadCount implements a fake IncNginxReloadCount. -func (mc *ManagerFakeCollector) IncNginxReloadCount() {} +// IncReloadCount implements a no-op IncReloadCount. +func (mc *ManagerNoopCollector) IncReloadCount() {} -// IncNginxReloadErrors implements a fake IncNginxReloadErrors. -func (mc *ManagerFakeCollector) IncNginxReloadErrors() {} +// IncReloadErrors implements a no-op IncReloadErrors. +func (mc *ManagerNoopCollector) IncReloadErrors() {} -// UpdateLastReloadTime implements a fake UpdateLastReloadTime. -func (mc *ManagerFakeCollector) UpdateLastReloadTime(_ time.Duration) {} +// ObserveLastReloadTime implements a no-op ObserveLastReloadTime. +func (mc *ManagerNoopCollector) ObserveLastReloadTime(_ time.Duration) {} diff --git a/internal/mode/static/nginx/runtime/manager.go b/internal/mode/static/nginx/runtime/manager.go index 377c9e3f86..4e64fa56f1 100644 --- a/internal/mode/static/nginx/runtime/manager.go +++ b/internal/mode/static/nginx/runtime/manager.go @@ -43,27 +43,27 @@ type Manager interface { // ManagerCollector is an interface for the metrics of the NGINX runtime manager. type ManagerCollector interface { - IncNginxReloadCount() - IncNginxReloadErrors() - UpdateLastReloadTime(ms time.Duration) + IncReloadCount() + IncReloadErrors() + ObserveLastReloadTime(ms time.Duration) } // ManagerImpl implements Manager. type ManagerImpl struct { verifyClient *verifyClient - metricsCollector ManagerCollector + managerCollector ManagerCollector } // NewManagerImpl creates a new ManagerImpl. -func NewManagerImpl(mgrCollector ManagerCollector) *ManagerImpl { +func NewManagerImpl(managerCollector ManagerCollector) *ManagerImpl { return &ManagerImpl{ verifyClient: newVerifyClient(nginxReloadTimeout), - metricsCollector: mgrCollector, + managerCollector: managerCollector, } } func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error { - t1 := time.Now() + start := time.Now() // We find the main NGINX PID on every reload because it will change if the NGINX container is restarted. pid, err := findMainProcess(ctx, os.Stat, os.ReadFile, pidFileTimeout) if err != nil { @@ -79,7 +79,7 @@ func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error { // send HUP signal to the NGINX main process reload configuration // See https://nginx.org/en/docs/control.html if err := syscall.Kill(pid, syscall.SIGHUP); err != nil { - m.metricsCollector.IncNginxReloadErrors() + m.managerCollector.IncReloadErrors() return fmt.Errorf("failed to send the HUP signal to NGINX main: %w", err) } @@ -90,18 +90,18 @@ func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error { os.ReadFile, childProcsTimeout, ); err != nil { - m.metricsCollector.IncNginxReloadErrors() + m.managerCollector.IncReloadErrors() return fmt.Errorf(noNewWorkersErrFmt, configVersion, err) } if err = m.verifyClient.waitForCorrectVersion(ctx, configVersion); err != nil { - m.metricsCollector.IncNginxReloadErrors() + m.managerCollector.IncReloadErrors() return err } - m.metricsCollector.IncNginxReloadCount() + m.managerCollector.IncReloadCount() - t2 := time.Now() - m.metricsCollector.UpdateLastReloadTime(t2.Sub(t1)) + finish := time.Now() + m.managerCollector.ObserveLastReloadTime(finish.Sub(start)) return nil }