Disallow dual-sync-async persistence without restarting

TheBlueMatt · TheBlueMatt · commit 91a9fa4b1375 · 2025-04-15T15:29:18.000Z
In general, we don't expect users to persist `ChannelMonitor[Update]`s both synchronously and asynchronously for a single `ChannelManager` instance. If a user has implemented asynchronous persistence, they should generally always use that, as there is then no advantage to them to occasionally persist synchronously. Even still, in 920d96e we fixed some bugs related to such operation, and noted that "there isn't much cost to supporting it". Sadly, this is not true. Specifically, the dual-sync-async persistence flow is ill-defined and difficult to define in a way that a user can realistically implement. Consider the case of a `ChannelMonitorUpdate` which is persisted asynchronously and while it is still being persisted a new `ChannelMonitorUpdate` is created. If the second `ChannelMonitorUpdate` is persisted synchronously, the `ChannelManager` will be left with a single pending `ChannelMonitorUpdate` which is not the latest. If we were to then restart, the latest copy of the `ChannelMonitor` would be that without any updates, but the `ChannelManager` has a pending `ChannelMonitorUpdate` for the next update, but not the one after that. The user would then have to handle the replayed `ChannelMonitorUpdate` and then find the second `ChannelMonitorUpdate` on disk and somehow know to replay that one as well. Further, we currently have a bug in handling this scenario as we'll complete all pending post-update actions when the second `ChannelMonitorUpdate` gets persisted synchronously, even though the first `ChannelMonitorUpdate` is still pending. While we could rather trivially fix these issues, addressing the larger API question above is difficult and as we don't anticipate this use-case being important, we just disable it here. Note that we continue to support it internally as some 39 tests rely on it. Issue highlighted by (changes to the) chanmon_consistency fuzz target (in the next commit).
diff --git a/fuzz/src/chanmon_consistency.rs b/fuzz/src/chanmon_consistency.rs
@@ -82,6 +82,8 @@ use bitcoin::secp256k1::{self, Message, PublicKey, Scalar, Secp256k1, SecretKey}
 
 use lightning::io::Cursor;
 use lightning::util::dyn_signer::DynSigner;
+
+use std::cell::RefCell;
 use std::cmp::{self, Ordering};
 use std::mem;
 use std::sync::atomic;
@@ -674,6 +676,9 @@ pub fn do_test<Out: Output>(data: &[u8], underlying_out: Out, anchors: bool) {
 		}};
 	}
 
+	let default_mon_style = RefCell::new(ChannelMonitorUpdateStatus::Completed);
+	let mon_style = [default_mon_style.clone(), default_mon_style.clone(), default_mon_style];
+
 	macro_rules! reload_node {
 		($ser: expr, $node_id: expr, $old_monitors: expr, $keys_manager: expr, $fee_estimator: expr) => {{
 			let keys_manager = Arc::clone(&$keys_manager);
@@ -746,6 +751,7 @@ pub fn do_test<Out: Output>(data: &[u8], underlying_out: Out, anchors: bool) {
 					Ok(ChannelMonitorUpdateStatus::Completed)
 				);
 			}
+			*chain_monitor.persister.update_ret.lock().unwrap() = *mon_style[$node_id].borrow();
 			res
 		}};
 	}
@@ -1393,28 +1399,22 @@ pub fn do_test<Out: Output>(data: &[u8], underlying_out: Out, anchors: bool) {
 			// bit-twiddling mutations to have similar effects. This is probably overkill, but no
 			// harm in doing so.
 			0x00 => {
-				*monitor_a.persister.update_ret.lock().unwrap() =
-					ChannelMonitorUpdateStatus::InProgress
+				*mon_style[0].borrow_mut() = ChannelMonitorUpdateStatus::InProgress;
 			},
 			0x01 => {
-				*monitor_b.persister.update_ret.lock().unwrap() =
-					ChannelMonitorUpdateStatus::InProgress
+				*mon_style[1].borrow_mut() = ChannelMonitorUpdateStatus::InProgress;
 			},
 			0x02 => {
-				*monitor_c.persister.update_ret.lock().unwrap() =
-					ChannelMonitorUpdateStatus::InProgress
+				*mon_style[2].borrow_mut() = ChannelMonitorUpdateStatus::InProgress;
 			},
 			0x04 => {
-				*monitor_a.persister.update_ret.lock().unwrap() =
-					ChannelMonitorUpdateStatus::Completed
+				*mon_style[0].borrow_mut() = ChannelMonitorUpdateStatus::Completed;
 			},
 			0x05 => {
-				*monitor_b.persister.update_ret.lock().unwrap() =
-					ChannelMonitorUpdateStatus::Completed
+				*mon_style[1].borrow_mut() = ChannelMonitorUpdateStatus::Completed;
 			},
 			0x06 => {
-				*monitor_c.persister.update_ret.lock().unwrap() =
-					ChannelMonitorUpdateStatus::Completed
+				*mon_style[2].borrow_mut() = ChannelMonitorUpdateStatus::Completed;
 			},
 
 			0x08 => complete_all_monitor_updates(&monitor_a, &chan_1_id),
@@ -1724,19 +1724,8 @@ pub fn do_test<Out: Output>(data: &[u8], underlying_out: Out, anchors: bool) {
 				// after we resolve all pending events.
 				// First make sure there are no pending monitor updates and further update
 				// operations complete.
-				*monitor_a.persister.update_ret.lock().unwrap() =
-					ChannelMonitorUpdateStatus::Completed;
-				*monitor_b.persister.update_ret.lock().unwrap() =
-					ChannelMonitorUpdateStatus::Completed;
-				*monitor_c.persister.update_ret.lock().unwrap() =
-					ChannelMonitorUpdateStatus::Completed;
-
-				complete_all_monitor_updates(&monitor_a, &chan_1_id);
-				complete_all_monitor_updates(&monitor_b, &chan_1_id);
-				complete_all_monitor_updates(&monitor_b, &chan_2_id);
-				complete_all_monitor_updates(&monitor_c, &chan_2_id);
-
-				// Next, make sure peers are all connected to each other
+
+				// First, make sure peers are all connected to each other
 				if chan_a_disconnected {
 					let init_1 = Init {
 						features: nodes[1].init_features(),
@@ -1769,42 +1758,65 @@ pub fn do_test<Out: Output>(data: &[u8], underlying_out: Out, anchors: bool) {
 				}
 
 				macro_rules! process_all_events {
-					() => {
+					() => { {
+						let mut last_pass_no_updates = false;
 						for i in 0..std::usize::MAX {
 							if i == 100 {
 								panic!("It may take may iterations to settle the state, but it should not take forever");
 							}
+							// Next, make sure no monitor updates are pending
+							complete_all_monitor_updates(&monitor_a, &chan_1_id);
+							complete_all_monitor_updates(&monitor_b, &chan_1_id);
+							complete_all_monitor_updates(&monitor_b, &chan_2_id);
+							complete_all_monitor_updates(&monitor_c, &chan_2_id);
 							// Then, make sure any current forwards make their way to their destination
 							if process_msg_events!(0, false, ProcessMessages::AllMessages) {
+								last_pass_no_updates = false;
 								continue;
 							}
 							if process_msg_events!(1, false, ProcessMessages::AllMessages) {
+								last_pass_no_updates = false;
 								continue;
 							}
 							if process_msg_events!(2, false, ProcessMessages::AllMessages) {
+								last_pass_no_updates = false;
 								continue;
 							}
 							// ...making sure any pending PendingHTLCsForwardable events are handled and
 							// payments claimed.
 							if process_events!(0, false) {
+								last_pass_no_updates = false;
 								continue;
 							}
 							if process_events!(1, false) {
+								last_pass_no_updates = false;
 								continue;
 							}
 							if process_events!(2, false) {
+								last_pass_no_updates = false;
 								continue;
 							}
-							break;
+							if last_pass_no_updates {
+								// In some cases, we may generate a message to send in
+								// `process_msg_events`, but block sending until
+								// `complete_all_monitor_updates` gets called on the next
+								// iteration.
+								//
+								// Thus, we only exit if we manage two iterations with no messages
+								// or events to process.
+								break;
+							}
+							last_pass_no_updates = true;
 						}
-					};
+					} };
 				}
 
-				// At this point, we may be pending quiescence, so we'll process all messages to
-				// ensure we can complete its handshake. We'll then exit quiescence and process all
-				// messages again, to resolve any pending HTLCs (only irrevocably committed ones)
-				// before attempting to send more payments.
+				// We may be pending quiescence, so first process all messages to ensure we can
+				// complete the quiescence handshake.
 				process_all_events!();
+
+				// Then exit quiescence and process all messages again, to resolve any pending
+				// HTLCs (only irrevocably committed ones) before attempting to send more payments.
 				nodes[0].exit_quiescence(&nodes[1].get_our_node_id(), &chan_a_id).unwrap();
 				nodes[1].exit_quiescence(&nodes[0].get_our_node_id(), &chan_a_id).unwrap();
 				nodes[1].exit_quiescence(&nodes[2].get_our_node_id(), &chan_b_id).unwrap();
diff --git a/lightning/src/chain/mod.rs b/lightning/src/chain/mod.rs
@@ -209,6 +209,12 @@ pub enum ChannelMonitorUpdateStatus {
 	///
 	/// This includes performing any `fsync()` calls required to ensure the update is guaranteed to
 	/// be available on restart even if the application crashes.
+	///
+	/// If you return this variant, you cannot later return [`InProgress`] from the same instance of
+	/// [`Persist`]/[`Watch`] without first restarting.
+	///
+	/// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress
+	/// [`Persist`]: chainmonitor::Persist
 	Completed,
 	/// Indicates that the update will happen asynchronously in the background or that a transient
 	/// failure occurred which is being retried in the background and will eventually complete.
@@ -234,7 +240,12 @@ pub enum ChannelMonitorUpdateStatus {
 	/// reliable, this feature is considered beta, and a handful of edge-cases remain. Until the
 	/// remaining cases are fixed, in rare cases, *using this feature may lead to funds loss*.
 	///
+	/// If you return this variant, you cannot later return [`Completed`] from the same instance of
+	/// [`Persist`]/[`Watch`] without first restarting.
+	///
 	/// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress
+	/// [`Completed`]: ChannelMonitorUpdateStatus::Completed
+	/// [`Persist`]: chainmonitor::Persist
 	InProgress,
 	/// Indicates that an update has failed and will not complete at any point in the future.
 	///
diff --git a/lightning/src/ln/channelmanager.rs b/lightning/src/ln/channelmanager.rs
@@ -2569,6 +2569,13 @@ where
 	#[cfg(any(test, feature = "_test_utils"))]
 	pub(super) per_peer_state: FairRwLock<HashMap<PublicKey, Mutex<PeerState<SP>>>>,
 
+	/// We only support using one of [`ChannelMonitorUpdateStatus::InProgress`] and
+	/// [`ChannelMonitorUpdateStatus::Completed`] without restarting. Because the API does not
+	/// otherwise directly enforce this, we enforce it in debug builds here by storing which one is
+	/// in use.
+	#[cfg(all(not(test), debug_assertions))]
+	monitor_update_type: AtomicUsize,
+
 	/// The set of events which we need to give to the user to handle. In some cases an event may
 	/// require some further action after the user handles it (currently only blocking a monitor
 	/// update from being handed to the user to ensure the included changes to the channel state
@@ -3312,11 +3319,19 @@ macro_rules! handle_new_monitor_update {
 				panic!("{}", err_str);
 			},
 			ChannelMonitorUpdateStatus::InProgress => {
+				#[cfg(all(not(test), debug_assertions))]
+				if $self.monitor_update_type.swap(1, Ordering::Relaxed) == 2 {
+					panic!("Cannot use both ChannelMonitorUpdateStatus modes InProgress and Completed without restart");
+				}
 				log_debug!($logger, "ChannelMonitor update for {} in flight, holding messages until the update completes.",
 					$channel_id);
 				false
 			},
 			ChannelMonitorUpdateStatus::Completed => {
+				#[cfg(all(not(test), debug_assertions))]
+				if $self.monitor_update_type.swap(2, Ordering::Relaxed) == 1 {
+					panic!("Cannot use both ChannelMonitorUpdateStatus modes InProgress and Completed without restart");
+				}
 				$completed;
 				true
 			},
@@ -3577,6 +3592,9 @@ where
 
 			per_peer_state: FairRwLock::new(new_hash_map()),
 
+			#[cfg(all(not(test), debug_assertions))]
+			monitor_update_type: AtomicUsize::new(0),
+
 			pending_events: Mutex::new(VecDeque::new()),
 			pending_events_processor: AtomicBool::new(false),
 			pending_background_events: Mutex::new(Vec::new()),
@@ -14747,6 +14765,9 @@ where
 
 			per_peer_state: FairRwLock::new(per_peer_state),
 
+			#[cfg(all(not(test), debug_assertions))]
+			monitor_update_type: AtomicUsize::new(0),
+
 			pending_events: Mutex::new(pending_events_read),
 			pending_events_processor: AtomicBool::new(false),
 			pending_background_events: Mutex::new(pending_background_events),