From 3287c157cc038673834a3da1a0ce5b8e1b3e7e4e Mon Sep 17 00:00:00 2001 From: Shane Harvey Date: Fri, 28 Feb 2025 13:14:15 -0800 Subject: [PATCH 1/5] PYTHON-5168 Use logging for client background task errors --- pymongo/asynchronous/mongo_client.py | 8 ++++---- pymongo/logger.py | 6 ++++++ pymongo/synchronous/mongo_client.py | 8 ++++---- test/asynchronous/test_client.py | 23 +++++++++++++++++++++++ test/test_client.py | 23 +++++++++++++++++++++++ 5 files changed, 60 insertions(+), 8 deletions(-) diff --git a/pymongo/asynchronous/mongo_client.py b/pymongo/asynchronous/mongo_client.py index 37be9a194c..eefafd5fda 100644 --- a/pymongo/asynchronous/mongo_client.py +++ b/pymongo/asynchronous/mongo_client.py @@ -88,7 +88,7 @@ _async_create_lock, _release_locks, ) -from pymongo.logger import _CLIENT_LOGGER, _log_or_warn +from pymongo.logger import _CLIENT_LOGGER, _log_client_error, _log_or_warn from pymongo.message import _CursorAddress, _GetMore, _Query from pymongo.monitoring import ConnectionClosedReason from pymongo.operations import ( @@ -2049,7 +2049,7 @@ async def _process_kill_cursors(self) -> None: # can be caught in _process_periodic_tasks raise else: - helpers_shared._handle_exception() + _log_client_error() # Don't re-open topology if it's closed and there's no pending cursors. if address_to_cursor_ids: @@ -2061,7 +2061,7 @@ async def _process_kill_cursors(self) -> None: if isinstance(exc, InvalidOperation) and self._topology._closed: raise else: - helpers_shared._handle_exception() + _log_client_error() # This method is run periodically by a background thread. async def _process_periodic_tasks(self) -> None: @@ -2075,7 +2075,7 @@ async def _process_periodic_tasks(self) -> None: if isinstance(exc, InvalidOperation) and self._topology._closed: return else: - helpers_shared._handle_exception() + _log_client_error() def _return_server_session( self, server_session: Union[_ServerSession, _EmptyServerSession] diff --git a/pymongo/logger.py b/pymongo/logger.py index 2ff35328b4..b0195167ed 100644 --- a/pymongo/logger.py +++ b/pymongo/logger.py @@ -96,6 +96,12 @@ class _SDAMStatusMessage(str, enum.Enum): } +def _log_client_error() -> None: + logger = _CLIENT_LOGGER + if logger: + logger.exception("MongoClient background task encountered an error:") + + def _debug_log(logger: logging.Logger, **fields: Any) -> None: logger.debug(LogMessage(**fields)) diff --git a/pymongo/synchronous/mongo_client.py b/pymongo/synchronous/mongo_client.py index 373deabd4e..b101636066 100644 --- a/pymongo/synchronous/mongo_client.py +++ b/pymongo/synchronous/mongo_client.py @@ -80,7 +80,7 @@ _create_lock, _release_locks, ) -from pymongo.logger import _CLIENT_LOGGER, _log_or_warn +from pymongo.logger import _CLIENT_LOGGER, _log_client_error, _log_or_warn from pymongo.message import _CursorAddress, _GetMore, _Query from pymongo.monitoring import ConnectionClosedReason from pymongo.operations import ( @@ -2043,7 +2043,7 @@ def _process_kill_cursors(self) -> None: # can be caught in _process_periodic_tasks raise else: - helpers_shared._handle_exception() + _log_client_error() # Don't re-open topology if it's closed and there's no pending cursors. if address_to_cursor_ids: @@ -2055,7 +2055,7 @@ def _process_kill_cursors(self) -> None: if isinstance(exc, InvalidOperation) and self._topology._closed: raise else: - helpers_shared._handle_exception() + _log_client_error() # This method is run periodically by a background thread. def _process_periodic_tasks(self) -> None: @@ -2069,7 +2069,7 @@ def _process_periodic_tasks(self) -> None: if isinstance(exc, InvalidOperation) and self._topology._closed: return else: - helpers_shared._handle_exception() + _log_client_error() def _return_server_session( self, server_session: Union[_ServerSession, _EmptyServerSession] diff --git a/test/asynchronous/test_client.py b/test/asynchronous/test_client.py index 744a170be2..ba29e5ccb1 100644 --- a/test/asynchronous/test_client.py +++ b/test/asynchronous/test_client.py @@ -1790,6 +1790,29 @@ async def stall_connect(*args, **kwargs): # Each ping command should not take more than 2 seconds self.assertLess(total, 2) + async def test_background_connections_log_on_error(self): + with self.assertLogs("pymongo.client", level="ERROR") as cm: + client = await self.async_rs_or_single_client(minPoolSize=1) + # Create a single connection in the pool. + await client.admin.command("ping") + + # Cause new connections to fail. + pool = await async_get_pool(client) + + async def fail_connect(*args, **kwargs): + raise Exception("failed to connect") + + pool.connect = fail_connect + # Un-patch Pool.connect to break the cyclic reference. + self.addCleanup(delattr, pool, "connect") + + await pool.reset_without_pause() + + await async_wait_until(lambda: len(cm.records) > 0, "start creating connections") + log_output = "".join(cm.output) + self.assertIn("failed to connect", log_output) + self.assertIn("MongoClient background task encountered an error", log_output) + @async_client_context.require_replica_set async def test_direct_connection(self): # direct_connection=True should result in Single topology. diff --git a/test/test_client.py b/test/test_client.py index cdc7691c28..116af931ff 100644 --- a/test/test_client.py +++ b/test/test_client.py @@ -1747,6 +1747,29 @@ def stall_connect(*args, **kwargs): # Each ping command should not take more than 2 seconds self.assertLess(total, 2) + def test_background_connections_log_on_error(self): + with self.assertLogs("pymongo.client", level="ERROR") as cm: + client = self.rs_or_single_client(minPoolSize=1) + # Create a single connection in the pool. + client.admin.command("ping") + + # Cause new connections to fail. + pool = get_pool(client) + + def fail_connect(*args, **kwargs): + raise Exception("failed to connect") + + pool.connect = fail_connect + # Un-patch Pool.connect to break the cyclic reference. + self.addCleanup(delattr, pool, "connect") + + pool.reset_without_pause() + + wait_until(lambda: len(cm.records) > 0, "start creating connections") + log_output = "".join(cm.output) + self.assertIn("failed to connect", log_output) + self.assertIn("MongoClient background task encountered an error", log_output) + @client_context.require_replica_set def test_direct_connection(self): # direct_connection=True should result in Single topology. From 42d638fd6407716d76768fa7d88f3f40ebf5c82b Mon Sep 17 00:00:00 2001 From: Shane Harvey Date: Tue, 4 Mar 2025 16:05:29 -0800 Subject: [PATCH 2/5] PYTHON-5168 Adjust test_continuous_network_errors to be less flaky --- test/asynchronous/test_client.py | 10 +++++----- test/test_client.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/asynchronous/test_client.py b/test/asynchronous/test_client.py index ba29e5ccb1..800b6b5fe0 100644 --- a/test/asynchronous/test_client.py +++ b/test/asynchronous/test_client.py @@ -1847,20 +1847,20 @@ def server_description_count(): return i gc.collect() - with client_knobs(min_heartbeat_interval=0.003): + with client_knobs(min_heartbeat_interval=0.002): client = self.simple_client( - "invalid:27017", heartbeatFrequencyMS=3, serverSelectionTimeoutMS=150 + "invalid:27017", heartbeatFrequencyMS=2, serverSelectionTimeoutMS=200 ) initial_count = server_description_count() with self.assertRaises(ServerSelectionTimeoutError): await client.test.test.find_one() gc.collect() final_count = server_description_count() + await client.close() # If a bug like PYTHON-2433 is reintroduced then too many # ServerDescriptions will be kept alive and this test will fail: - # AssertionError: 19 != 46 within 15 delta (27 difference) - # On Python 3.11 we seem to get more of a delta. - self.assertAlmostEqual(initial_count, final_count, delta=20) + # AssertionError: 11 != 47 within 20 delta (36 difference) + self.assertAlmostEqual(initial_count, final_count, delta=30) @async_client_context.require_failCommand_fail_point async def test_network_error_message(self): diff --git a/test/test_client.py b/test/test_client.py index 116af931ff..b7a43841df 100644 --- a/test/test_client.py +++ b/test/test_client.py @@ -1804,20 +1804,20 @@ def server_description_count(): return i gc.collect() - with client_knobs(min_heartbeat_interval=0.003): + with client_knobs(min_heartbeat_interval=0.002): client = self.simple_client( - "invalid:27017", heartbeatFrequencyMS=3, serverSelectionTimeoutMS=150 + "invalid:27017", heartbeatFrequencyMS=2, serverSelectionTimeoutMS=200 ) initial_count = server_description_count() with self.assertRaises(ServerSelectionTimeoutError): client.test.test.find_one() gc.collect() final_count = server_description_count() + client.close() # If a bug like PYTHON-2433 is reintroduced then too many # ServerDescriptions will be kept alive and this test will fail: - # AssertionError: 19 != 46 within 15 delta (27 difference) - # On Python 3.11 we seem to get more of a delta. - self.assertAlmostEqual(initial_count, final_count, delta=20) + # AssertionError: 11 != 47 within 20 delta (36 difference) + self.assertAlmostEqual(initial_count, final_count, delta=30) @client_context.require_failCommand_fail_point def test_network_error_message(self): From c0b56cd42b39b0b188f93e903448cebf2c1c3988 Mon Sep 17 00:00:00 2001 From: Shane Harvey Date: Wed, 5 Mar 2025 10:38:52 -0800 Subject: [PATCH 3/5] PYTHON-5168 Fix race in test --- test/asynchronous/test_client.py | 8 ++++---- test/test_client.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/asynchronous/test_client.py b/test/asynchronous/test_client.py index 800b6b5fe0..acc815c8a4 100644 --- a/test/asynchronous/test_client.py +++ b/test/asynchronous/test_client.py @@ -1808,10 +1808,10 @@ async def fail_connect(*args, **kwargs): await pool.reset_without_pause() - await async_wait_until(lambda: len(cm.records) > 0, "start creating connections") - log_output = "".join(cm.output) - self.assertIn("failed to connect", log_output) - self.assertIn("MongoClient background task encountered an error", log_output) + await async_wait_until( + lambda: "failed to connect" in "".join(cm.output), "start creating connections" + ) + self.assertIn("MongoClient background task encountered an error", "".join(cm.output)) @async_client_context.require_replica_set async def test_direct_connection(self): diff --git a/test/test_client.py b/test/test_client.py index b7a43841df..8e99866cc8 100644 --- a/test/test_client.py +++ b/test/test_client.py @@ -1765,10 +1765,10 @@ def fail_connect(*args, **kwargs): pool.reset_without_pause() - wait_until(lambda: len(cm.records) > 0, "start creating connections") - log_output = "".join(cm.output) - self.assertIn("failed to connect", log_output) - self.assertIn("MongoClient background task encountered an error", log_output) + wait_until( + lambda: "failed to connect" in "".join(cm.output), "start creating connections" + ) + self.assertIn("MongoClient background task encountered an error", "".join(cm.output)) @client_context.require_replica_set def test_direct_connection(self): From ad7801bfc35b6a34d47dbe0c9a919236795de4c7 Mon Sep 17 00:00:00 2001 From: Shane Harvey Date: Wed, 5 Mar 2025 13:07:36 -0800 Subject: [PATCH 4/5] Revert "PYTHON-5168 Adjust test_continuous_network_errors to be less flaky" This reverts commit 42d638fd6407716d76768fa7d88f3f40ebf5c82b. --- test/asynchronous/test_client.py | 10 +++++----- test/test_client.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/asynchronous/test_client.py b/test/asynchronous/test_client.py index acc815c8a4..b0b226dd7c 100644 --- a/test/asynchronous/test_client.py +++ b/test/asynchronous/test_client.py @@ -1847,20 +1847,20 @@ def server_description_count(): return i gc.collect() - with client_knobs(min_heartbeat_interval=0.002): + with client_knobs(min_heartbeat_interval=0.003): client = self.simple_client( - "invalid:27017", heartbeatFrequencyMS=2, serverSelectionTimeoutMS=200 + "invalid:27017", heartbeatFrequencyMS=3, serverSelectionTimeoutMS=150 ) initial_count = server_description_count() with self.assertRaises(ServerSelectionTimeoutError): await client.test.test.find_one() gc.collect() final_count = server_description_count() - await client.close() # If a bug like PYTHON-2433 is reintroduced then too many # ServerDescriptions will be kept alive and this test will fail: - # AssertionError: 11 != 47 within 20 delta (36 difference) - self.assertAlmostEqual(initial_count, final_count, delta=30) + # AssertionError: 19 != 46 within 15 delta (27 difference) + # On Python 3.11 we seem to get more of a delta. + self.assertAlmostEqual(initial_count, final_count, delta=20) @async_client_context.require_failCommand_fail_point async def test_network_error_message(self): diff --git a/test/test_client.py b/test/test_client.py index 8e99866cc8..ccf30795a1 100644 --- a/test/test_client.py +++ b/test/test_client.py @@ -1804,20 +1804,20 @@ def server_description_count(): return i gc.collect() - with client_knobs(min_heartbeat_interval=0.002): + with client_knobs(min_heartbeat_interval=0.003): client = self.simple_client( - "invalid:27017", heartbeatFrequencyMS=2, serverSelectionTimeoutMS=200 + "invalid:27017", heartbeatFrequencyMS=3, serverSelectionTimeoutMS=150 ) initial_count = server_description_count() with self.assertRaises(ServerSelectionTimeoutError): client.test.test.find_one() gc.collect() final_count = server_description_count() - client.close() # If a bug like PYTHON-2433 is reintroduced then too many # ServerDescriptions will be kept alive and this test will fail: - # AssertionError: 11 != 47 within 20 delta (36 difference) - self.assertAlmostEqual(initial_count, final_count, delta=30) + # AssertionError: 19 != 46 within 15 delta (27 difference) + # On Python 3.11 we seem to get more of a delta. + self.assertAlmostEqual(initial_count, final_count, delta=20) @client_context.require_failCommand_fail_point def test_network_error_message(self): From 4e35cc3e47caa57e75ec8a850ea844cbbe24128d Mon Sep 17 00:00:00 2001 From: Shane Harvey Date: Wed, 5 Mar 2025 13:47:31 -0800 Subject: [PATCH 5/5] PYTHON-5168 Add comments --- pymongo/logger.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pymongo/logger.py b/pymongo/logger.py index b0195167ed..9079dc3f3d 100644 --- a/pymongo/logger.py +++ b/pymongo/logger.py @@ -97,8 +97,10 @@ class _SDAMStatusMessage(str, enum.Enum): def _log_client_error() -> None: + # This is called from a daemon thread so check for None to account for interpreter shutdown. logger = _CLIENT_LOGGER if logger: + # logger.exception includes the full traceback. logger.exception("MongoClient background task encountered an error:")