From 2f7b83f82b90ee175870199f71d4149a08ffe005 Mon Sep 17 00:00:00 2001 From: Andrew Papsujko Date: Fri, 16 May 2025 12:32:51 +0300 Subject: [PATCH] feat: added metric that shows bytes held by non-active replication slot and according trigger --- documentation/metrics.md | 64 +++++++++++++++++++++++++- mamonsu/lib/default_config.py | 2 + mamonsu/lib/platform.py | 1 + mamonsu/plugins/pgsql/driver/pool.py | 26 ++++++++++- mamonsu/plugins/pgsql/replication.py | 69 ++++++++++++++++++++++++++-- mamonsu/tools/bootstrap/sql.py | 17 +++++++ 6 files changed, 172 insertions(+), 7 deletions(-) diff --git a/documentation/metrics.md b/documentation/metrics.md index 15ef4a1a..d507a8e4 100644 --- a/documentation/metrics.md +++ b/documentation/metrics.md @@ -3691,7 +3691,8 @@ Default config: ### Replication Default config: -        lag_more_than_in_sec = 300 +        lag_more_than_in_sec = 300\ +        critical_bytes_held_by_non_active_slot = 1073741824 bytes ### Items @@ -3763,6 +3764,37 @@ Default config: *Non-active Replication Slots* calculates as count of slots with `false` active status. +- **Bytes Held By Non-active Replication Slots** + + Zabbix item: + + + + + + + + + + + + + + + + + + + + + + + + + +
Name: PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}
Key: pgsql.replication.non_active_slots_held_bytes[{#NON_ACTIVE_SLOT_NAME}]
Type: Numeric (float)
Units: Bytes
Delta: As Is
Supported Version: 10+
+ + *Non-active Replication Slots* calculates as count of slots with `false` active status. - **Streaming Replication Lag** @@ -3861,12 +3893,40 @@ Default config: +- **PostgreSQL Replication: Non-active Slots Discovery** + + Items: + + + + + + + + + + + + + + + + + + + + + +
Name: PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}
Key: pgsql.replication.non_active_slots_held_bytes[{#NON_ACTIVE_SLOT_NAME}]
Type: Numeric (float)
Units: Bytes
Delta: As Is
+ ### Triggers - **PostgreSQL Instance: server mode has been changed on {HOSTNAME} to {ITEM.LASTVALUE}** - **PostgreSQL number of non-active replication slots on {HOSTNAME} (value={ITEM.LASTVALUE})** - + Disabled by default +- **PostgreSQL Replication: bytes held by slot {#NON_ACTIVE_SLOT_NAME} is too high (value={ITEM.LASTVALUE})** + Triggers if *PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}* exceeds `critical_bytes_held_by_non_active_slot`. - **PostgreSQL streaming lag too high on {HOSTNAME} (value={ITEM.LASTVALUE})** Triggers if *PostgreSQL Replication: Streaming Replication Lag* exceeds `lag_more_than_in_sec`. diff --git a/mamonsu/lib/default_config.py b/mamonsu/lib/default_config.py index c7f2d985..12791a18 100644 --- a/mamonsu/lib/default_config.py +++ b/mamonsu/lib/default_config.py @@ -35,6 +35,8 @@ def default_host(): host = os.environ.get('PGHOST') or 'auto' if platform.FREEBSD: host = os.environ.get('PGHOST') or 'auto' + if platform.DARWIN: + host = os.environ.get('PGHOST') or 'auto' return host @staticmethod diff --git a/mamonsu/lib/platform.py b/mamonsu/lib/platform.py index 5ea5faa0..279200d1 100644 --- a/mamonsu/lib/platform.py +++ b/mamonsu/lib/platform.py @@ -3,5 +3,6 @@ LINUX = (sys.platform == 'linux' or sys.platform == 'linux2') WINDOWS = (sys.platform == 'win32' or sys.platform == 'win64') FREEBSD = ('freebsd' in sys.platform) +DARWIN = sys.platform == 'darwin' UNIX = LINUX or FREEBSD INTEGER_TYPES = int, diff --git a/mamonsu/plugins/pgsql/driver/pool.py b/mamonsu/plugins/pgsql/driver/pool.py index 6576f92a..a8433d98 100644 --- a/mamonsu/plugins/pgsql/driver/pool.py +++ b/mamonsu/plugins/pgsql/driver/pool.py @@ -86,7 +86,7 @@ class Pool(object): """ SELECT application_name, {0} - coalesce((pg_{1}_{2}_diff(pg_current_{1}_{2}(), replay_lsn))::int, 0) AS total_lag + coalesce((pg_{1}_{2}_diff(pg_current_{1}_{2}(), replay_{2}))::int, 0) AS total_lag FROM pg_stat_replication; """, """ @@ -95,6 +95,30 @@ class 
Pool(object): total_lag FROM mamonsu.count_{1}_lag_lsn(); """ + ), + "wal_held_bytes_master": ( + """ + SELECT slot_name, + coalesce((pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn))::int, 0) AS wal_held_bytes + FROM pg_replication_slots; + """, + """ + SELECT slot_name, + wal_held_bytes + FROM mamonsu.bytes_held_by_inactive_slot_on_master(); + """ + ), + "wal_held_bytes_replica": ( + """ + SELECT slot_name, + coalesce((pg_wal_lsn_diff(pg_last_wal_replay_lsn(), restart_lsn))::int, 0) AS wal_held_bytes + FROM pg_replication_slots; + """, + """ + SELECT slot_name, + wal_held_bytes + FROM mamonsu.bytes_held_by_inactive_slot_on_replica(); + """ ) } diff --git a/mamonsu/plugins/pgsql/replication.py b/mamonsu/plugins/pgsql/replication.py index 8a51889a..7ed701c1 100644 --- a/mamonsu/plugins/pgsql/replication.py +++ b/mamonsu/plugins/pgsql/replication.py @@ -13,7 +13,8 @@ class Replication(Plugin): AgentPluginType = "pg" # key: (macro, value) plugin_macros = { - "critical_lag_seconds": [("macro", "{$CRITICAL_LAG_SECONDS}"), ("value", 60 * 5)] + "critical_lag_seconds": [("macro", "{$CRITICAL_LAG_SECONDS}"), ("value", 60 * 5)], + "critical_bytes_held_by_none_active_slot": [("macro", "{$CRITICAL_BYTES_HELD_BY_NON_ACTIVE_SLOT}"), ("value", 1024 * 1024 * 1024)] } # get time of replication lag @@ -30,8 +31,15 @@ class Replication(Plugin): WHERE active = 'false'; """ + query_bytes_held_by_non_active_slot = """ + SELECT slot_name, coalesce(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::bigint, 0) AS wal_size_bytes + FROM pg_replication_slots + WHERE active = 'false'; + """ + # for discovery rule for name of each replica key_lsn_replication_discovery = "pgsql.replication.discovery{0}" + key_replication_non_active_slots_discovery = "pgsql.replication.non_active_slots_discovery{0}" key_total_lag = "pgsql.replication.total_lag{0}" # for PG 10 and higher key_flush = "pgsql.replication.flush_lag{0}" @@ -42,6 +50,7 @@ class Replication(Plugin): key_replication = 
"pgsql.replication_lag{0}" key_non_active_slots = "pgsql.replication.non_active_slots{0}" + key_non_active_slots_held_bytes = "pgsql.replication.non_active_slots_held_bytes{0}" def run(self, zbx): @@ -79,6 +88,14 @@ def run(self, zbx): zbx.send("pgsql.replication.replay_lag[{0}]".format(info[0]), float(info[5])) zbx.send("pgsql.replication.discovery[]", zbx.json({"data": lags})) del lags + bytes_held_by_non_active_slot = Pooler.run_sql_type("wal_held_bytes_master", args=[]) + if bytes_held_by_non_active_slot: + discovery = [] + for info in bytes_held_by_non_active_slot: + discovery.append({"{#NON_ACTIVE_SLOT_NAME}": info[0]}) + zbx.send("pgsql.replication.non_active_slots_held_bytes[{0}]".format(info[0]), int(info[1])) + zbx.send("pgsql.replication.non_active_slots_discovery[]", zbx.json({"data": discovery})) + del discovery elif Pooler.is_superuser() or Pooler.is_bootstraped(): result_lags = Pooler.run_sql_type("wal_lag_lsn", args=[" ", "xlog", "location"]) if result_lags: @@ -90,7 +107,15 @@ def run(self, zbx): del lags else: self.disable_and_exit_if_not_superuser() - + else: + bytes_held_by_non_active_slot = Pooler.run_sql_type("wal_held_bytes_replica", args=[]) + if bytes_held_by_non_active_slot: + discovery = [] + for info in bytes_held_by_non_active_slot: + discovery.append({"{#NON_ACTIVE_SLOT_NAME}": info[0]}) + zbx.send("pgsql.replication.non_active_slots_held_bytes[{0}]".format(info[0]), int(info[1])) + zbx.send("pgsql.replication.non_active_slots_discovery[]", zbx.json({"data": discovery})) + del discovery non_active_slots = Pooler.query(self.query_non_active_slots) zbx.send(self.key_non_active_slots.format("[]"), int(non_active_slots[0][0])) @@ -132,7 +157,8 @@ def triggers(self, template, dashboard=False): }) + template.trigger({ "name": "PostgreSQL Replication: number of non-active replication slots on {HOSTNAME} (value={ITEM.LASTVALUE})", "expression": "{#TEMPLATE:" + self.right_type(self.key_non_active_slots) + ".last()}>" + str( - 
NUMBER_NON_ACTIVE_SLOTS) + NUMBER_NON_ACTIVE_SLOTS), + "status": 1 }) return triggers @@ -198,7 +224,42 @@ def discovery_rules(self, template, dashboard=False): ] } ] - return template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs) + active_slots_discovery_rule = template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs) + + rule = { + "name": "PostgreSQL Replication: Non Active Slots Discovery", + "key": self.key_replication_non_active_slots_discovery.format("[{0}]".format(self.Macros[self.Type])) + } + if Plugin.old_zabbix: + conditions = [] + rule["filter"] = "{#NON_ACTIVE_SLOT_NAME}:.*" + else: + conditions = [{ + "condition": [ + {"macro": "{#NON_ACTIVE_SLOT_NAME}", + "value": ".*", + "operator": 8, + "formulaid": "A"} + ] + }] + items = [ + {"key": self.right_type(self.key_non_active_slots_held_bytes, var_discovery="{#NON_ACTIVE_SLOT_NAME},"), + "name": "PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}", + "value_type": Plugin.VALUE_TYPE.numeric_float, + "delay": self.plugin_config("interval"), + "drawtype": 2} + ] + graphs = [] + triggers = [ + { + "name": "PostgreSQL Replication: bytes held by slot {#NON_ACTIVE_SLOT_NAME} is too high (value={ITEM.LASTVALUE})", + "expression": "{#TEMPLATE:" + self.right_type(self.key_non_active_slots_held_bytes, var_discovery="{#NON_ACTIVE_SLOT_NAME},") + ".last()}>" + + self.plugin_macros["critical_bytes_held_by_none_active_slot"][0][1] + } + ] + non_active_slots_discovery_rule = template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs, triggers=triggers) + + return active_slots_discovery_rule + non_active_slots_discovery_rule def keys_and_queries(self, template_zabbix): result = [] diff --git a/mamonsu/tools/bootstrap/sql.py b/mamonsu/tools/bootstrap/sql.py index f37be0f0..bf99442a 100644 --- a/mamonsu/tools/bootstrap/sql.py +++ b/mamonsu/tools/bootstrap/sql.py @@ -236,6 +236,23 @@ 
coalesce((pg_{7}_diff(pg_current_{7}(), replay_{9}))::bigint, 0) AS total_lag FROM pg_stat_replication $$ LANGUAGE SQL SECURITY DEFINER; + +DROP FUNCTION IF EXISTS mamonsu.bytes_held_by_inactive_slot_on_master(); +CREATE OR REPLACE FUNCTION mamonsu.bytes_held_by_inactive_slot_on_master() +RETURNS TABLE(slot_name TEXT, wal_held_bytes BIGINT) AS $$ +SELECT slot_name::TEXT, coalesce((pg_{7}_diff(pg_current_wal_lsn(), restart_lsn))::bigint, 0) AS wal_held_bytes +FROM pg_replication_slots +WHERE active = 'false' +$$ LANGUAGE SQL SECURITY DEFINER; + +DROP FUNCTION IF EXISTS mamonsu.bytes_held_by_inactive_slot_on_replica(); +CREATE OR REPLACE FUNCTION mamonsu.bytes_held_by_inactive_slot_on_replica() +RETURNS TABLE(slot_name TEXT, wal_held_bytes BIGINT) AS $$ +SELECT slot_name::TEXT, coalesce((pg_{7}_diff(pg_last_wal_replay_lsn(), restart_lsn))::bigint, 0) AS wal_held_bytes +FROM pg_replication_slots +WHERE active = 'false' +$$ LANGUAGE SQL SECURITY DEFINER; + """ CreatePgBuffercacheFunctionsSQL = """