From f2bb931ef98c38e681b35d0a3583fb65f0d463e0 Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Sun, 10 Sep 2023 22:11:56 +0000 Subject: [PATCH 01/11] Rewrite failure payment retry tests to avoid perm-fail storage Two tests in the payment tests currently rely on failing to persist ChannelMonitorUpdates as their method of failing payments before they even get out the door. In the coming commits we'll drop the persist failure error codes, so here rewrite these tests to rely on trying to send more than is available in a channel. --- lightning/src/ln/payment_tests.rs | 79 +++++++++++++++---------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/lightning/src/ln/payment_tests.rs b/lightning/src/ln/payment_tests.rs index 3def4e3629b..e5af136656f 100644 --- a/lightning/src/ln/payment_tests.rs +++ b/lightning/src/ln/payment_tests.rs @@ -2258,12 +2258,14 @@ fn auto_retry_partial_failure() { let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]); let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs); + // Open three channels, the first has plenty of liquidity, the second and third have ~no + // available liquidity, causing any outbound payments routed over it to fail immediately. let chan_1_id = create_announced_chan_between_nodes(&nodes, 0, 1).0.contents.short_channel_id; - let chan_2_id = create_announced_chan_between_nodes(&nodes, 0, 1).0.contents.short_channel_id; - let chan_3_id = create_announced_chan_between_nodes(&nodes, 0, 1).0.contents.short_channel_id; + let chan_2_id = create_announced_chan_between_nodes_with_value(&nodes, 0, 1, 1_000_000, 989_000_000).0.contents.short_channel_id; + let chan_3_id = create_announced_chan_between_nodes_with_value(&nodes, 0, 1, 1_000_000, 989_000_000).0.contents.short_channel_id; // Marshall data to send the payment - let amt_msat = 20_000; + let amt_msat = 10_000_000; let (_, payment_hash, payment_preimage, payment_secret) = get_route_and_payment_hash!(&nodes[0], nodes[1], amt_msat); #[cfg(feature = "std")] let payment_expiry_secs = SystemTime::UNIX_EPOCH.elapsed().unwrap().as_secs() + 60 * 60; @@ -2278,16 +2280,6 @@ fn auto_retry_partial_failure() { .with_bolt11_features(invoice_features).unwrap(); let route_params = RouteParameters::from_payment_params_and_value(payment_params, amt_msat); - // Ensure the first monitor update (for the initial send path1 over chan_1) succeeds, but the - // second (for the initial send path2 over chan_2) fails. - chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::Completed); - chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::PermanentFailure); - // Ensure third monitor update (for the retry1's path1 over chan_1) succeeds, but the fourth (for - // the retry1's path2 over chan_3) fails, and monitor updates succeed after that. - chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::Completed); - chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::PermanentFailure); - chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::Completed); - // Configure the initial send, retry1 and retry2's paths. let send_route = Route { paths: vec![ @@ -2364,32 +2356,23 @@ fn auto_retry_partial_failure() { // Send a payment that will partially fail on send, then partially fail on retry, then succeed. 
nodes[0].node.send_payment(payment_hash, RecipientOnionFields::secret_only(payment_secret), PaymentId(payment_hash.0), route_params, Retry::Attempts(3)).unwrap(); - let closed_chan_events = nodes[0].node.get_and_clear_pending_events(); - assert_eq!(closed_chan_events.len(), 4); - match closed_chan_events[0] { - Event::ChannelClosed { .. } => {}, - _ => panic!("Unexpected event"), - } - match closed_chan_events[1] { + let payment_failed_events = nodes[0].node.get_and_clear_pending_events(); + assert_eq!(payment_failed_events.len(), 2); + match payment_failed_events[0] { Event::PaymentPathFailed { .. } => {}, _ => panic!("Unexpected event"), } - match closed_chan_events[2] { - Event::ChannelClosed { .. } => {}, - _ => panic!("Unexpected event"), - } - match closed_chan_events[3] { + match payment_failed_events[1] { Event::PaymentPathFailed { .. } => {}, _ => panic!("Unexpected event"), } // Pass the first part of the payment along the path. - check_added_monitors!(nodes[0], 5); // three outbound channel updates succeeded, two permanently failed + check_added_monitors!(nodes[0], 1); // only one HTLC actually made it out let mut msg_events = nodes[0].node.get_and_clear_pending_msg_events(); - // First message is the first update_add, remaining messages are broadcasting channel updates and - // errors for the permfailed channels - assert_eq!(msg_events.len(), 5); + // Only one HTLC/channel update actually made it out + assert_eq!(msg_events.len(), 1); let mut payment_event = SendEvent::from_event(msg_events.remove(0)); nodes[1].node.handle_update_add_htlc(&nodes[0].node.get_our_node_id(), &payment_event.msgs[0]); @@ -2478,12 +2461,13 @@ fn auto_retry_zero_attempts_send_error() { let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]); let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs); - create_announced_chan_between_nodes(&nodes, 0, 1).0.contents.short_channel_id; - create_announced_chan_between_nodes(&nodes, 0, 1).0.contents.short_channel_id; + // Open a single channel that does not have sufficient liquidity for the payment we want to + // send. + let chan_id = create_announced_chan_between_nodes_with_value(&nodes, 0, 1, 1_000_000, 989_000_000).0.contents.short_channel_id; // Marshall data to send the payment - let amt_msat = 20_000; - let (_, payment_hash, _, payment_secret) = get_route_and_payment_hash!(&nodes[0], nodes[1], amt_msat); + let amt_msat = 10_000_000; + let (_, payment_hash, payment_secret) = get_payment_preimage_hash(&nodes[1], Some(amt_msat), None); #[cfg(feature = "std")] let payment_expiry_secs = SystemTime::UNIX_EPOCH.elapsed().unwrap().as_secs() + 60 * 60; #[cfg(not(feature = "std"))] @@ -2497,16 +2481,31 @@ fn auto_retry_zero_attempts_send_error() { .with_bolt11_features(invoice_features).unwrap(); let route_params = RouteParameters::from_payment_params_and_value(payment_params, amt_msat); - chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::PermanentFailure); + // Override the route search to return a route, rather than failing at the route-finding step. 
+ let send_route = Route { + paths: vec![ + Path { hops: vec![RouteHop { + pubkey: nodes[1].node.get_our_node_id(), + node_features: nodes[1].node.node_features(), + short_channel_id: chan_id, + channel_features: nodes[1].node.channel_features(), + fee_msat: amt_msat, + cltv_expiry_delta: 100, + maybe_announced_channel: true, + }], blinded_tail: None }, + ], + route_params: Some(route_params.clone()), + }; + nodes[0].router.expect_find_route(route_params.clone(), Ok(send_route)); + nodes[0].node.send_payment(payment_hash, RecipientOnionFields::secret_only(payment_secret), PaymentId(payment_hash.0), route_params, Retry::Attempts(0)).unwrap(); - assert_eq!(nodes[0].node.get_and_clear_pending_msg_events().len(), 2); // channel close messages + assert!(nodes[0].node.get_and_clear_pending_msg_events().is_empty()); let events = nodes[0].node.get_and_clear_pending_events(); - assert_eq!(events.len(), 3); - if let Event::ChannelClosed { .. } = events[0] { } else { panic!(); } - if let Event::PaymentPathFailed { .. } = events[1] { } else { panic!(); } - if let Event::PaymentFailed { .. } = events[2] { } else { panic!(); } - check_added_monitors!(nodes[0], 2); + assert_eq!(events.len(), 2); + if let Event::PaymentPathFailed { .. } = events[0] { } else { panic!(); } + if let Event::PaymentFailed { .. } = events[1] { } else { panic!(); } + check_added_monitors!(nodes[0], 0); } #[test] From 23c5308bcbea61d89b1e5d3df6b6da721bc74c2a Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Sun, 10 Sep 2023 17:14:32 +0000 Subject: [PATCH 02/11] Drop the `ChannelMonitorUpdateStatus::PermanentFailure` variant When a `ChannelMonitorUpdate` fails to apply, it generally means we cannot reach our storage backend. This, in general, is a critical issue, but is often only a transient issue. Sadly, users see the failure variant and return it on any I/O error, resulting in channel force-closures due to transient issues. Users don't generally expect force-closes in most cases, and luckily with async `ChannelMonitorUpdate`s supported we don't take any risk by "delaying" the `ChannelMonitorUpdate` indefinitely. Thus, here we drop the `PermanentFailure` variant entirely, making all failures instead be "the update is in progress, but won't ever complete", which is equivalent if we do not close the channel automatically. 
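As an illustration of the new contract, below is a rough sketch of what a
`Persist` implementation might look like after this change. It is not LDK
code: the `ExamplePersister` type, its on-disk layout, and the
`storage_unavailable` flag are invented for the example. The point is that
an I/O failure is reported as `ChannelMonitorUpdateStatus::InProgress`,
which freezes the channel, while the implementation separately flags that
the node should be shut down and storage repaired, instead of returning the
removed `PermanentFailure` and force-closing.

use std::sync::atomic::{AtomicBool, Ordering};

use lightning::chain::ChannelMonitorUpdateStatus;
use lightning::chain::chainmonitor::{MonitorUpdateId, Persist};
use lightning::chain::channelmonitor::{ChannelMonitor, ChannelMonitorUpdate};
use lightning::chain::transaction::OutPoint;
use lightning::sign::WriteableEcdsaChannelSigner;
use lightning::util::ser::Writeable;

/// Illustrative only: persists monitors to local files and records when storage fails.
pub struct ExamplePersister {
	/// Checked by the application's main loop; when set, stop accepting payments and
	/// shut down so the storage problem can be fixed and the node restarted.
	pub storage_unavailable: AtomicBool,
}

impl ExamplePersister {
	fn write_monitor<S: WriteableEcdsaChannelSigner>(
		&self, funding_txo: OutPoint, monitor: &ChannelMonitor<S>,
	) -> std::io::Result<()> {
		// Hypothetical layout: one file per channel, keyed by funding outpoint.
		let path = format!("monitors/{}_{}", funding_txo.txid, funding_txo.index);
		std::fs::write(path, monitor.encode())
	}

	fn status(&self, res: std::io::Result<()>) -> ChannelMonitorUpdateStatus {
		match res {
			Ok(()) => ChannelMonitorUpdateStatus::Completed,
			Err(_) => {
				// Report the update as "in progress, but will never complete": the
				// channel is frozen rather than force-closed, and we ask the node to
				// shut down for manual intervention.
				self.storage_unavailable.store(true, Ordering::Release);
				ChannelMonitorUpdateStatus::InProgress
			},
		}
	}
}

impl<S: WriteableEcdsaChannelSigner> Persist<S> for ExamplePersister {
	fn persist_new_channel(&self, funding_txo: OutPoint, monitor: &ChannelMonitor<S>,
		_update_id: MonitorUpdateId,
	) -> ChannelMonitorUpdateStatus {
		self.status(self.write_monitor(funding_txo, monitor))
	}

	fn update_persisted_channel(&self, funding_txo: OutPoint,
		_update: Option<&ChannelMonitorUpdate>, monitor: &ChannelMonitor<S>,
		_update_id: MonitorUpdateId,
	) -> ChannelMonitorUpdateStatus {
		// For simplicity this sketch always rewrites the full monitor; a real
		// implementation may persist the `ChannelMonitorUpdate` delta instead.
		self.status(self.write_monitor(funding_txo, monitor))
	}
}

Since an `InProgress` reported this way is never marked completed, the
channel simply stays frozen until the node restarts with working storage,
matching the "in progress, but won't ever complete" behavior described
above.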
--- fuzz/src/chanmon_consistency.rs | 4 +- lightning-persister/src/fs_store.rs | 6 +- lightning/src/chain/chainmonitor.rs | 74 +++-------- lightning/src/chain/channelmonitor.rs | 32 ++--- lightning/src/chain/mod.rs | 72 +++------- lightning/src/ln/chanmon_update_fail_tests.rs | 123 ++---------------- lightning/src/ln/channelmanager.rs | 106 +++++++-------- lightning/src/ln/functional_test_utils.rs | 4 +- lightning/src/ln/functional_tests.rs | 12 +- lightning/src/ln/reload_tests.rs | 2 +- lightning/src/util/persist.rs | 8 +- lightning/src/util/test_utils.rs | 2 +- 12 files changed, 129 insertions(+), 316 deletions(-) diff --git a/fuzz/src/chanmon_consistency.rs b/fuzz/src/chanmon_consistency.rs index b6d41fb9914..6a2007165d7 100644 --- a/fuzz/src/chanmon_consistency.rs +++ b/fuzz/src/chanmon_consistency.rs @@ -138,7 +138,7 @@ impl TestChainMonitor { } } impl chain::Watch for TestChainMonitor { - fn watch_channel(&self, funding_txo: OutPoint, monitor: channelmonitor::ChannelMonitor) -> chain::ChannelMonitorUpdateStatus { + fn watch_channel(&self, funding_txo: OutPoint, monitor: channelmonitor::ChannelMonitor) -> Result { let mut ser = VecWriter(Vec::new()); monitor.write(&mut ser).unwrap(); if let Some(_) = self.latest_monitors.lock().unwrap().insert(funding_txo, (monitor.get_latest_update_id(), ser.0)) { @@ -500,7 +500,7 @@ pub fn do_test(data: &[u8], underlying_out: Out) { let res = (<(BlockHash, ChanMan)>::read(&mut Cursor::new(&$ser.0), read_args).expect("Failed to read manager").1, chain_monitor.clone()); for (funding_txo, mon) in monitors.drain() { assert_eq!(chain_monitor.chain_monitor.watch_channel(funding_txo, mon), - ChannelMonitorUpdateStatus::Completed); + Ok(ChannelMonitorUpdateStatus::Completed)); } res } } diff --git a/lightning-persister/src/fs_store.rs b/lightning-persister/src/fs_store.rs index 81f9709c454..6725e16974b 100644 --- a/lightning-persister/src/fs_store.rs +++ b/lightning-persister/src/fs_store.rs @@ -436,7 +436,7 @@ mod tests { } // Test that if the store's path to channel data is read-only, writing a - // monitor to it results in the store returning a PermanentFailure. + // monitor to it results in the store returning an InProgress. // Windows ignores the read-only flag for folders, so this test is Unix-only. #[cfg(not(target_os = "windows"))] #[test] @@ -470,7 +470,7 @@ mod tests { index: 0 }; match store.persist_new_channel(test_txo, &added_monitors[0].1, update_id.2) { - ChannelMonitorUpdateStatus::PermanentFailure => {}, + ChannelMonitorUpdateStatus::InProgress => {}, _ => panic!("unexpected result from persisting new channel") } @@ -507,7 +507,7 @@ mod tests { index: 0 }; match store.persist_new_channel(test_txo, &added_monitors[0].1, update_id.2) { - ChannelMonitorUpdateStatus::PermanentFailure => {}, + ChannelMonitorUpdateStatus::InProgress => {}, _ => panic!("unexpected result from persisting new channel") } diff --git a/lightning/src/chain/chainmonitor.rs b/lightning/src/chain/chainmonitor.rs index b3b62a2e870..e17e2dbd34b 100644 --- a/lightning/src/chain/chainmonitor.rs +++ b/lightning/src/chain/chainmonitor.rs @@ -78,7 +78,7 @@ impl MonitorUpdateId { /// `Persist` defines behavior for persisting channel monitors: this could mean /// writing once to disk, and/or uploading to one or more backup services. 
/// -/// Each method can return three possible values: +/// Each method can return two possible values: /// * If persistence (including any relevant `fsync()` calls) happens immediately, the /// implementation should return [`ChannelMonitorUpdateStatus::Completed`], indicating normal /// channel operation should continue. @@ -91,10 +91,9 @@ impl MonitorUpdateId { /// Note that unlike the direct [`chain::Watch`] interface, /// [`ChainMonitor::channel_monitor_updated`] must be called once for *each* update which occurs. /// -/// * If persistence fails for some reason, implementations should return -/// [`ChannelMonitorUpdateStatus::PermanentFailure`], in which case the channel will likely be -/// closed without broadcasting the latest state. See -/// [`ChannelMonitorUpdateStatus::PermanentFailure`] for more details. +/// If persistence fails for some reason, implementations should still return +/// [`ChannelMonitorUpdateStatus::InProgress`] and attempt to shut down or otherwise resolve the +/// situation ASAP. /// /// Third-party watchtowers may be built as a part of an implementation of this trait, with the /// advantage that you can control whether to resume channel operation depending on if an update @@ -335,11 +334,6 @@ where C::Target: chain::Filter, match self.persister.update_persisted_channel(*funding_outpoint, None, monitor, update_id) { ChannelMonitorUpdateStatus::Completed => log_trace!(self.logger, "Finished syncing Channel Monitor for channel {}", log_funding_info!(monitor)), - ChannelMonitorUpdateStatus::PermanentFailure => { - monitor_state.channel_perm_failed.store(true, Ordering::Release); - self.pending_monitor_events.lock().unwrap().push((*funding_outpoint, vec![MonitorEvent::UpdateFailed(*funding_outpoint)], monitor.get_counterparty_node_id())); - self.event_notifier.notify(); - } ChannelMonitorUpdateStatus::InProgress => { log_debug!(self.logger, "Channel Monitor sync for channel {} in progress, holding events until completion!", log_funding_info!(monitor)); pending_monitor_updates.push(update_id); @@ -673,12 +667,12 @@ where C::Target: chain::Filter, /// /// Note that we persist the given `ChannelMonitor` while holding the `ChainMonitor` /// monitors lock. 
- fn watch_channel(&self, funding_outpoint: OutPoint, monitor: ChannelMonitor) -> ChannelMonitorUpdateStatus { + fn watch_channel(&self, funding_outpoint: OutPoint, monitor: ChannelMonitor) -> Result { let mut monitors = self.monitors.write().unwrap(); let entry = match monitors.entry(funding_outpoint) { hash_map::Entry::Occupied(_) => { log_error!(self.logger, "Failed to add new channel data: channel monitor for given outpoint is already present"); - return ChannelMonitorUpdateStatus::PermanentFailure + return Err(()); }, hash_map::Entry::Vacant(e) => e, }; @@ -691,10 +685,6 @@ where C::Target: chain::Filter, log_info!(self.logger, "Persistence of new ChannelMonitor for channel {} in progress", log_funding_info!(monitor)); pending_monitor_updates.push(update_id); }, - ChannelMonitorUpdateStatus::PermanentFailure => { - log_error!(self.logger, "Persistence of new ChannelMonitor for channel {} failed", log_funding_info!(monitor)); - return persist_res; - }, ChannelMonitorUpdateStatus::Completed => { log_info!(self.logger, "Persistence of new ChannelMonitor for channel {} completed", log_funding_info!(monitor)); } @@ -708,7 +698,7 @@ where C::Target: chain::Filter, channel_perm_failed: AtomicBool::new(false), last_chain_persist_height: AtomicUsize::new(self.highest_chain_height.load(Ordering::Acquire)), }); - persist_res + Ok(persist_res) } /// Note that we persist the given `ChannelMonitor` update while holding the @@ -723,10 +713,10 @@ where C::Target: chain::Filter, // We should never ever trigger this from within ChannelManager. Technically a // user could use this object with some proxying in between which makes this // possible, but in tests and fuzzing, this should be a panic. - #[cfg(any(test, fuzzing))] + #[cfg(debug_assertions)] panic!("ChannelManager generated a channel update for a channel that was not yet registered!"); - #[cfg(not(any(test, fuzzing)))] - ChannelMonitorUpdateStatus::PermanentFailure + #[cfg(not(debug_assertions))] + ChannelMonitorUpdateStatus::InProgress }, Some(monitor_state) => { let monitor = &monitor_state.monitor; @@ -745,18 +735,14 @@ where C::Target: chain::Filter, pending_monitor_updates.push(update_id); log_debug!(self.logger, "Persistence of ChannelMonitorUpdate for channel {} in progress", log_funding_info!(monitor)); }, - ChannelMonitorUpdateStatus::PermanentFailure => { - monitor_state.channel_perm_failed.store(true, Ordering::Release); - log_error!(self.logger, "Persistence of ChannelMonitorUpdate for channel {} failed", log_funding_info!(monitor)); - }, ChannelMonitorUpdateStatus::Completed => { log_debug!(self.logger, "Persistence of ChannelMonitorUpdate for channel {} completed", log_funding_info!(monitor)); }, } if update_res.is_err() { - ChannelMonitorUpdateStatus::PermanentFailure + ChannelMonitorUpdateStatus::InProgress } else if monitor_state.channel_perm_failed.load(Ordering::Acquire) { - ChannelMonitorUpdateStatus::PermanentFailure + ChannelMonitorUpdateStatus::InProgress } else { persist_res } @@ -831,12 +817,12 @@ impl ChannelMonitor { self.inner.lock().unwrap().counterparty_node_id } - /// Used by ChannelManager deserialization to broadcast the latest holder state if its copy of - /// the Channel was out-of-date. + /// Used by [`ChannelManager`] deserialization to broadcast the latest holder state if its copy + /// of the channel state was out-of-date. 
/// /// You may also use this to broadcast the latest local commitment transaction, either because - /// a monitor update failed with [`ChannelMonitorUpdateStatus::PermanentFailure`] or because we've - /// fallen behind (i.e. we've received proof that our counterparty side knows a revocation - /// secret we gave them that they shouldn't know). + /// a monitor update failed or because we've fallen behind (i.e. we've received proof that our + /// counterparty side knows a revocation secret we gave them that they shouldn't know). /// /// Broadcasting these transactions in the second case is UNSAFE, as they allow counterparty /// side to punish you. Nevertheless you may want to broadcast them if counterparty doesn't /// close channel with their commitment transaction after a substantial amount of time. Best /// may be to contact the other node operator out-of-band to coordinate other options available - /// to you. In any-case, the choice is up to you. + /// to you. /// - /// [`ChannelMonitorUpdateStatus::PermanentFailure`]: super::ChannelMonitorUpdateStatus::PermanentFailure + /// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager pub fn get_latest_holder_commitment_txn(&self, logger: &L) -> Vec where L::Target: Logger { self.inner.lock().unwrap().get_latest_holder_commitment_txn(logger) @@ -2599,6 +2595,7 @@ impl ChannelMonitorImpl { ChannelMonitorUpdateStep::CommitmentSecret { idx, secret } => { log_trace!(logger, "Updating ChannelMonitor with commitment secret"); if let Err(e) = self.provide_secret(*idx, *secret) { + debug_assert!(false, "Latest counterparty commitment secret was invalid"); log_error!(logger, "Providing latest counterparty commitment secret failed/was refused:"); log_error!(logger, " {}", e); ret = Err(()); @@ -4413,13 +4410,12 @@ mod tests { use crate::chain::chaininterface::LowerBoundedFeeEstimator; use super::ChannelMonitorUpdateStep; - use crate::{check_added_monitors, check_closed_broadcast, check_closed_event, check_spends, get_local_commitment_txn, get_monitor, get_route_and_payment_hash, unwrap_send_err}; + use crate::{check_added_monitors, check_spends, get_local_commitment_txn, get_monitor, get_route_and_payment_hash, unwrap_send_err}; use crate::chain::{BestBlock, Confirm}; use crate::chain::channelmonitor::ChannelMonitor; use crate::chain::package::{weight_offered_htlc, weight_received_htlc, weight_revoked_offered_htlc, weight_revoked_received_htlc, WEIGHT_REVOKED_OUTPUT}; use crate::chain::transaction::OutPoint; use crate::sign::InMemorySigner; - use crate::events::ClosureReason; use crate::ln::{PaymentPreimage, PaymentHash}; use crate::ln::chan_utils; use crate::ln::chan_utils::{HTLCOutputInCommitment, ChannelPublicKeys, ChannelTransactionParameters, HolderCommitmentTransaction, CounterpartyChannelTransactionParameters}; @@ -4485,18 +4481,14 @@ mod tests { let (route, payment_hash, _, payment_secret) = get_route_and_payment_hash!(nodes[1], nodes[0], 100_000); unwrap_send_err!(nodes[1].node.send_payment_with_route(&route, payment_hash, RecipientOnionFields::secret_only(payment_secret), PaymentId(payment_hash.0) - ), true, APIError::ChannelUnavailable { ref err }, - assert!(err.contains("ChannelMonitor storage failure"))); - check_added_monitors!(nodes[1], 2); // After the failure we generate a close-channel monitor update - check_closed_broadcast!(nodes[1], true); - check_closed_event!(nodes[1], 1, ClosureReason::ProcessingError { err: "ChannelMonitor storage failure".to_string() }, - [nodes[0].node.get_our_node_id()], 100000); + ), false, 
APIError::MonitorUpdateInProgress, {}); + check_added_monitors!(nodes[1], 1); // Build a new ChannelMonitorUpdate which contains both the failing commitment tx update // and provides the claim preimages for the two pending HTLCs. The first update generates // an error, but the point of this test is to ensure the later updates are still applied. let monitor_updates = nodes[1].chain_monitor.monitor_updates.lock().unwrap(); - let mut replay_update = monitor_updates.get(&channel.2).unwrap().iter().rev().skip(1).next().unwrap().clone(); + let mut replay_update = monitor_updates.get(&channel.2).unwrap().iter().rev().next().unwrap().clone(); assert_eq!(replay_update.updates.len(), 1); if let ChannelMonitorUpdateStep::LatestCounterpartyCommitmentTXInfo { .. } = replay_update.updates[0] { } else { panic!(); } diff --git a/lightning/src/chain/mod.rs b/lightning/src/chain/mod.rs index 236b10a7b19..736da139a55 100644 --- a/lightning/src/chain/mod.rs +++ b/lightning/src/chain/mod.rs @@ -192,10 +192,6 @@ pub enum ChannelMonitorUpdateStatus { /// have been successfully applied, a [`MonitorEvent::Completed`] can be used to restore the /// channel to an operational state. /// - /// Note that a given [`ChannelManager`] will *never* re-generate a [`ChannelMonitorUpdate`]. - /// If you return this error you must ensure that it is written to disk safely before writing - /// the latest [`ChannelManager`] state, or you should return [`PermanentFailure`] instead. - /// /// Even when a channel has been "frozen", updates to the [`ChannelMonitor`] can continue to /// occur (e.g. if an inbound HTLC which we forwarded was claimed upstream, resulting in us /// attempting to claim it on this channel) and those updates must still be persisted. @@ -208,49 +204,8 @@ pub enum ChannelMonitorUpdateStatus { /// remote location (with local copies persisted immediately), it is anticipated that all /// updates will return [`InProgress`] until the remote copies could be updated. /// - /// [`PermanentFailure`]: ChannelMonitorUpdateStatus::PermanentFailure /// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress - /// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager InProgress, - /// Used to indicate no further channel monitor updates will be allowed (likely a disk failure - /// or a remote copy of this [`ChannelMonitor`] is no longer reachable and thus not updatable). - /// - /// When this is returned, [`ChannelManager`] will force-close the channel but *not* broadcast - /// our current commitment transaction. This avoids a dangerous case where a local disk failure - /// (e.g. the Linux-default remounting of the disk as read-only) causes [`PermanentFailure`]s - /// for all monitor updates. If we were to broadcast our latest commitment transaction and then - /// restart, we could end up reading a previous [`ChannelMonitor`] and [`ChannelManager`], - /// revoking our now-broadcasted state before seeing it confirm and losing all our funds. - /// - /// Note that this is somewhat of a tradeoff - if the disk is really gone and we may have lost - /// the data permanently, we really should broadcast immediately. If the data can be recovered - /// with manual intervention, we'd rather close the channel, rejecting future updates to it, - /// and broadcast the latest state only if we have HTLCs to claim which are timing out (which - /// we do as long as blocks are connected). 
- /// - /// In order to broadcast the latest local commitment transaction, you'll need to call - /// [`ChannelMonitor::get_latest_holder_commitment_txn`] and broadcast the resulting - /// transactions once you've safely ensured no further channel updates can be generated by your - /// [`ChannelManager`]. - /// - /// Note that at least one final [`ChannelMonitorUpdate`] may still be provided, which must - /// still be processed by a running [`ChannelMonitor`]. This final update will mark the - /// [`ChannelMonitor`] as finalized, ensuring no further updates (e.g. revocation of the latest - /// commitment transaction) are allowed. - /// - /// Note that even if you return a [`PermanentFailure`] due to unavailability of secondary - /// [`ChannelMonitor`] copies, you should still make an attempt to store the update where - /// possible to ensure you can claim HTLC outputs on the latest commitment transaction - /// broadcasted later. - /// - /// In case of distributed watchtowers deployment, the new version must be written to disk, as - /// state may have been stored but rejected due to a block forcing a commitment broadcast. This - /// storage is used to claim outputs of rejected state confirmed onchain by another watchtower, - /// lagging behind on block processing. - /// - /// [`PermanentFailure`]: ChannelMonitorUpdateStatus::PermanentFailure - /// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager - PermanentFailure, } /// The `Watch` trait defines behavior for watching on-chain activity pertaining to channels as @@ -262,16 +217,13 @@ pub enum ChannelMonitorUpdateStatus { /// requirements. /// /// Implementations **must** ensure that updates are successfully applied and persisted upon method -/// completion. If an update fails with a [`PermanentFailure`], then it must immediately shut down -/// without taking any further action such as persisting the current state. +/// completion. If an update will not succeed, then it must immediately shut down. /// /// If an implementation maintains multiple instances of a channel's monitor (e.g., by storing /// backup copies), then it must ensure that updates are applied across all instances. Otherwise, it /// could result in a revoked transaction being broadcast, allowing the counterparty to claim all /// funds in the channel. See [`ChannelMonitorUpdateStatus`] for more details about how to handle /// multiple instances. -/// -/// [`PermanentFailure`]: ChannelMonitorUpdateStatus::PermanentFailure pub trait Watch { /// Watches a channel identified by `funding_txo` using `monitor`. /// @@ -279,20 +231,30 @@ pub trait Watch { /// with any spends of outputs returned by [`get_outputs_to_watch`]. In practice, this means /// calling [`block_connected`] and [`block_disconnected`] on the monitor. /// - /// Note: this interface MUST error with [`ChannelMonitorUpdateStatus::PermanentFailure`] if - /// the given `funding_txo` has previously been registered via `watch_channel`. + /// A return of `Err(())` indicates that the channel should immediately be force-closed without + /// broadcasting the funding transaction. + /// + /// If the given `funding_txo` has previously been registered via `watch_channel`, `Err(())` + /// must be returned. 
/// /// [`get_outputs_to_watch`]: channelmonitor::ChannelMonitor::get_outputs_to_watch /// [`block_connected`]: channelmonitor::ChannelMonitor::block_connected /// [`block_disconnected`]: channelmonitor::ChannelMonitor::block_disconnected - fn watch_channel(&self, funding_txo: OutPoint, monitor: ChannelMonitor) -> ChannelMonitorUpdateStatus; + fn watch_channel(&self, funding_txo: OutPoint, monitor: ChannelMonitor) -> Result; /// Updates a channel identified by `funding_txo` by applying `update` to its monitor. /// - /// Implementations must call [`update_monitor`] with the given update. See - /// [`ChannelMonitorUpdateStatus`] for invariants around returning an error. + /// Implementations must call [`ChannelMonitor::update_monitor`] with the given update. This + /// may fail (returning an `Err(())`), in which case this should return + /// [`ChannelMonitorUpdateStatus::InProgress`] (and the update should never complete). This + /// generally implies the channel has been closed (either by the funding outpoint being spent + /// on-chain or the [`ChannelMonitor`] having decided to do so and broadcasted a transaction), + /// and the [`ChannelManager`] state will be updated once it sees the funding spend on-chain. + /// + /// If persistence fails, this should return [`ChannelMonitorUpdateStatus::InProgress`] and + /// the node should shut down immediately. /// - /// [`update_monitor`]: channelmonitor::ChannelMonitor::update_monitor + /// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager fn update_channel(&self, funding_txo: OutPoint, update: &ChannelMonitorUpdate) -> ChannelMonitorUpdateStatus; /// Returns any monitor events since the last call. Subsequent calls must only return new diff --git a/lightning/src/ln/chanmon_update_fail_tests.rs b/lightning/src/ln/chanmon_update_fail_tests.rs index 33f4bbc8c26..dc9bbd3e404 100644 --- a/lightning/src/ln/chanmon_update_fail_tests.rs +++ b/lightning/src/ln/chanmon_update_fail_tests.rs @@ -37,43 +37,6 @@ use bitcoin::hashes::Hash; use crate::prelude::*; use crate::sync::{Arc, Mutex}; -#[test] -fn test_simple_monitor_permanent_update_fail() { - // Test that we handle a simple permanent monitor update failure - let chanmon_cfgs = create_chanmon_cfgs(2); - let node_cfgs = create_node_cfgs(2, &chanmon_cfgs); - let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]); - let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs); - create_announced_chan_between_nodes(&nodes, 0, 1); - - let (route, payment_hash_1, _, payment_secret_1) = get_route_and_payment_hash!(&nodes[0], nodes[1], 1000000); - chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::PermanentFailure); - unwrap_send_err!(nodes[0].node.send_payment_with_route(&route, payment_hash_1, - RecipientOnionFields::secret_only(payment_secret_1), PaymentId(payment_hash_1.0) - ), true, APIError::ChannelUnavailable {..}, {}); - check_added_monitors!(nodes[0], 2); - - let events_1 = nodes[0].node.get_and_clear_pending_msg_events(); - assert_eq!(events_1.len(), 2); - match events_1[0] { - MessageSendEvent::BroadcastChannelUpdate { .. } => {}, - _ => panic!("Unexpected event"), - }; - match events_1[1] { - MessageSendEvent::HandleError { node_id, .. 
} => assert_eq!(node_id, nodes[1].node.get_our_node_id()), - _ => panic!("Unexpected event"), - }; - - assert!(nodes[0].tx_broadcaster.txn_broadcasted.lock().unwrap().is_empty()); - - // TODO: Once we hit the chain with the failure transaction we should check that we get a - // PaymentPathFailed event - - assert_eq!(nodes[0].node.list_channels().len(), 0); - check_closed_event!(nodes[0], 1, ClosureReason::ProcessingError { err: "ChannelMonitor storage failure".to_string() }, - [nodes[1].node.get_our_node_id()], 100000); -} - #[test] fn test_monitor_and_persister_update_fail() { // Test that if both updating the `ChannelMonitor` and persisting the updated @@ -117,14 +80,11 @@ fn test_monitor_and_persister_update_fail() { new_monitor }; let chain_mon = test_utils::TestChainMonitor::new(Some(&chain_source), &tx_broadcaster, &logger, &chanmon_cfgs[0].fee_estimator, &persister, &node_cfgs[0].keys_manager); - assert_eq!(chain_mon.watch_channel(outpoint, new_monitor), ChannelMonitorUpdateStatus::Completed); + assert_eq!(chain_mon.watch_channel(outpoint, new_monitor), Ok(ChannelMonitorUpdateStatus::Completed)); chain_mon }; chain_mon.chain_monitor.block_connected(&create_dummy_block(BlockHash::all_zeros(), 42, Vec::new()), 200); - // Set the persister's return value to be a InProgress. - persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress); - // Try to update ChannelMonitor nodes[1].node.claim_funds(preimage); expect_payment_claimed!(nodes[1], payment_hash, 9_000_000); @@ -133,17 +93,21 @@ fn test_monitor_and_persister_update_fail() { let updates = get_htlc_update_msgs!(nodes[1], nodes[0].node.get_our_node_id()); assert_eq!(updates.update_fulfill_htlcs.len(), 1); nodes[0].node.handle_update_fulfill_htlc(&nodes[1].node.get_our_node_id(), &updates.update_fulfill_htlcs[0]); + { let mut node_0_per_peer_lock; let mut node_0_peer_state_lock; if let ChannelPhase::Funded(ref mut channel) = get_channel_ref!(nodes[0], nodes[1], node_0_per_peer_lock, node_0_peer_state_lock, chan.2) { if let Ok(Some(update)) = channel.commitment_signed(&updates.commitment_signed, &node_cfgs[0].logger) { - // Check that even though the persister is returning a InProgress, - // because the update is bogus, ultimately the error that's returned - // should be a PermanentFailure. - if let ChannelMonitorUpdateStatus::PermanentFailure = chain_mon.chain_monitor.update_channel(outpoint, &update) {} else { panic!("Expected monitor error to be permanent"); } - logger.assert_log_regex("lightning::chain::chainmonitor", regex::Regex::new("Persistence of ChannelMonitorUpdate for channel [0-9a-f]* in progress").unwrap(), 1); - assert_eq!(nodes[0].chain_monitor.update_channel(outpoint, &update), ChannelMonitorUpdateStatus::Completed); + // Check that the persister returns InProgress (and will never actually complete) + // as the monitor update errors. + if let ChannelMonitorUpdateStatus::InProgress = chain_mon.chain_monitor.update_channel(outpoint, &update) {} else { panic!("Expected monitor paused"); } + logger.assert_log_regex("lightning::chain::chainmonitor", regex::Regex::new("Failed to update ChannelMonitor for channel [0-9a-f]*.").unwrap(), 1); + + // Apply the monitor update to the original ChainMonitor, ensuring the + // ChannelManager and ChannelMonitor aren't out of sync. 
+ assert_eq!(nodes[0].chain_monitor.update_channel(outpoint, &update), + ChannelMonitorUpdateStatus::Completed); } else { assert!(false); } } else { assert!(false); @@ -151,8 +115,7 @@ fn test_monitor_and_persister_update_fail() { } check_added_monitors!(nodes[0], 1); - let events = nodes[0].node.get_and_clear_pending_events(); - assert_eq!(events.len(), 1); + expect_payment_sent(&nodes[0], preimage, None, false, false); } fn do_test_simple_monitor_temporary_update_fail(disconnect: bool) { @@ -2675,68 +2638,6 @@ fn test_temporary_error_during_shutdown() { check_closed_event!(nodes[0], 1, ClosureReason::CooperativeClosure, [nodes[1].node.get_our_node_id()], 100000); } -#[test] -fn test_permanent_error_during_sending_shutdown() { - // Test that permanent failures when updating the monitor's shutdown script result in a force - // close when initiating a cooperative close. - let mut config = test_default_channel_config(); - config.channel_handshake_config.commit_upfront_shutdown_pubkey = false; - - let chanmon_cfgs = create_chanmon_cfgs(2); - let node_cfgs = create_node_cfgs(2, &chanmon_cfgs); - let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[Some(config), None]); - let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs); - - let channel_id = create_announced_chan_between_nodes(&nodes, 0, 1).2; - chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::PermanentFailure); - - assert!(nodes[0].node.close_channel(&channel_id, &nodes[1].node.get_our_node_id()).is_ok()); - - // We always send the `shutdown` response when initiating a shutdown, even if we immediately - // close the channel thereafter. - let msg_events = nodes[0].node.get_and_clear_pending_msg_events(); - assert_eq!(msg_events.len(), 3); - if let MessageSendEvent::SendShutdown { .. } = msg_events[0] {} else { panic!(); } - if let MessageSendEvent::BroadcastChannelUpdate { .. } = msg_events[1] {} else { panic!(); } - if let MessageSendEvent::HandleError { .. } = msg_events[2] {} else { panic!(); } - - check_added_monitors!(nodes[0], 2); - check_closed_event!(nodes[0], 1, ClosureReason::ProcessingError { err: "ChannelMonitor storage failure".to_string() }, - [nodes[1].node.get_our_node_id()], 100000); -} - -#[test] -fn test_permanent_error_during_handling_shutdown() { - // Test that permanent failures when updating the monitor's shutdown script result in a force - // close when handling a cooperative close. - let mut config = test_default_channel_config(); - config.channel_handshake_config.commit_upfront_shutdown_pubkey = false; - - let chanmon_cfgs = create_chanmon_cfgs(2); - let node_cfgs = create_node_cfgs(2, &chanmon_cfgs); - let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, Some(config)]); - let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs); - - let channel_id = create_announced_chan_between_nodes(&nodes, 0, 1).2; - chanmon_cfgs[1].persister.set_update_ret(ChannelMonitorUpdateStatus::PermanentFailure); - - assert!(nodes[0].node.close_channel(&channel_id, &nodes[1].node.get_our_node_id()).is_ok()); - let shutdown = get_event_msg!(nodes[0], MessageSendEvent::SendShutdown, nodes[1].node.get_our_node_id()); - nodes[1].node.handle_shutdown(&nodes[0].node.get_our_node_id(), &shutdown); - - // We always send the `shutdown` response when receiving a shutdown, even if we immediately - // close the channel thereafter. - let msg_events = nodes[1].node.get_and_clear_pending_msg_events(); - assert_eq!(msg_events.len(), 3); - if let MessageSendEvent::SendShutdown { .. 
} = msg_events[0] {} else { panic!(); } - if let MessageSendEvent::BroadcastChannelUpdate { .. } = msg_events[1] {} else { panic!(); } - if let MessageSendEvent::HandleError { .. } = msg_events[2] {} else { panic!(); } - - check_added_monitors!(nodes[1], 2); - check_closed_event!(nodes[1], 1, ClosureReason::ProcessingError { err: "ChannelMonitor storage failure".to_string() }, - [nodes[0].node.get_our_node_id()], 100000); -} - #[test] fn double_temp_error() { // Test that it's OK to have multiple `ChainMonitor::update_channel` calls fail in a row. diff --git a/lightning/src/ln/channelmanager.rs b/lightning/src/ln/channelmanager.rs index 0cadbd41a29..35735c38926 100644 --- a/lightning/src/ln/channelmanager.rs +++ b/lightning/src/ln/channelmanager.rs @@ -2051,17 +2051,6 @@ macro_rules! handle_new_monitor_update { &$chan.context.channel_id()); Ok(false) }, - ChannelMonitorUpdateStatus::PermanentFailure => { - log_error!($self.logger, "Closing channel {} due to monitor update ChannelMonitorUpdateStatus::PermanentFailure", - &$chan.context.channel_id()); - update_maps_on_chan_removal!($self, &$chan.context); - let res = Err(MsgHandleErrInternal::from_finish_shutdown( - "ChannelMonitor storage failure".to_owned(), $chan.context.channel_id(), - $chan.context.get_user_id(), $chan.context.force_shutdown(false), - $self.get_channel_update_for_broadcast(&$chan).ok(), $chan.context.get_value_satoshis())); - $remove; - res - }, ChannelMonitorUpdateStatus::Completed => { $completed; Ok(true) @@ -5919,47 +5908,55 @@ where Err(MsgHandleErrInternal::send_err_msg_no_close("Already had channel with the new channel_id".to_owned(), funding_msg.channel_id)) }, hash_map::Entry::Vacant(e) => { - match self.id_to_peer.lock().unwrap().entry(chan.context.channel_id()) { + let mut id_to_peer_lock = self.id_to_peer.lock().unwrap(); + match id_to_peer_lock.entry(chan.context.channel_id()) { hash_map::Entry::Occupied(_) => { return Err(MsgHandleErrInternal::send_err_msg_no_close( "The funding_created message had the same funding_txid as an existing channel - funding is not possible".to_owned(), funding_msg.channel_id)) }, hash_map::Entry::Vacant(i_e) => { - i_e.insert(chan.context.get_counterparty_node_id()); - } - } - - // There's no problem signing a counterparty's funding transaction if our monitor - // hasn't persisted to disk yet - we can't lose money on a transaction that we haven't - // accepted payment from yet. We do, however, need to wait to send our channel_ready - // until we have persisted our monitor. - let new_channel_id = funding_msg.channel_id; - peer_state.pending_msg_events.push(events::MessageSendEvent::SendFundingSigned { - node_id: counterparty_node_id.clone(), - msg: funding_msg, - }); + let monitor_res = self.chain_monitor.watch_channel(monitor.get_funding_txo().0, monitor); + if let Ok(persist_state) = monitor_res { + i_e.insert(chan.context.get_counterparty_node_id()); + mem::drop(id_to_peer_lock); + + // There's no problem signing a counterparty's funding transaction if our monitor + // hasn't persisted to disk yet - we can't lose money on a transaction that we haven't + // accepted payment from yet. We do, however, need to wait to send our channel_ready + // until we have persisted our monitor. 
+ let new_channel_id = funding_msg.channel_id; + peer_state.pending_msg_events.push(events::MessageSendEvent::SendFundingSigned { + node_id: counterparty_node_id.clone(), + msg: funding_msg, + }); - let monitor_res = self.chain_monitor.watch_channel(monitor.get_funding_txo().0, monitor); - - if let ChannelPhase::Funded(chan) = e.insert(ChannelPhase::Funded(chan)) { - let mut res = handle_new_monitor_update!(self, monitor_res, peer_state_lock, peer_state, - per_peer_state, chan, MANUALLY_REMOVING_INITIAL_MONITOR, - { peer_state.channel_by_id.remove(&new_channel_id) }); - - // Note that we reply with the new channel_id in error messages if we gave up on the - // channel, not the temporary_channel_id. This is compatible with ourselves, but the - // spec is somewhat ambiguous here. Not a huge deal since we'll send error messages for - // any messages referencing a previously-closed channel anyway. - // We do not propagate the monitor update to the user as it would be for a monitor - // that we didn't manage to store (and that we don't care about - we don't respond - // with the funding_signed so the channel can never go on chain). - if let Err(MsgHandleErrInternal { shutdown_finish: Some((res, _)), .. }) = &mut res { - res.0 = None; + if let ChannelPhase::Funded(chan) = e.insert(ChannelPhase::Funded(chan)) { + let mut res = handle_new_monitor_update!(self, persist_state, peer_state_lock, peer_state, + per_peer_state, chan, MANUALLY_REMOVING_INITIAL_MONITOR, + { peer_state.channel_by_id.remove(&new_channel_id) }); + + // Note that we reply with the new channel_id in error messages if we gave up on the + // channel, not the temporary_channel_id. This is compatible with ourselves, but the + // spec is somewhat ambiguous here. Not a huge deal since we'll send error messages for + // any messages referencing a previously-closed channel anyway. + // We do not propagate the monitor update to the user as it would be for a monitor + // that we didn't manage to store (and that we don't care about - we don't respond + // with the funding_signed so the channel can never go on chain). + if let Err(MsgHandleErrInternal { shutdown_finish: Some((res, _)), .. }) = &mut res { + res.0 = None; + } + res.map(|_| ()) + } else { + unreachable!("This must be a funded channel as we just inserted it."); + } + } else { + log_error!(self.logger, "Persisting initial ChannelMonitor failed, implying the funding outpoint was duplicated"); + return Err(MsgHandleErrInternal::send_err_msg_no_close( + "The funding_created message had the same funding_txid as an existing channel - funding is not possible".to_owned(), + funding_msg.channel_id)); + } } - res.map(|_| ()) - } else { - unreachable!("This must be a funded channel as we just inserted it."); } } } @@ -5982,17 +5979,20 @@ where ChannelPhase::Funded(ref mut chan) => { let monitor = try_chan_phase_entry!(self, chan.funding_signed(&msg, best_block, &self.signer_provider, &self.logger), chan_phase_entry); - let update_res = self.chain_monitor.watch_channel(chan.context.get_funding_txo().unwrap(), monitor); - let mut res = handle_new_monitor_update!(self, update_res, peer_state_lock, peer_state, per_peer_state, chan_phase_entry, INITIAL_MONITOR); - if let Err(MsgHandleErrInternal { ref mut shutdown_finish, .. }) = res { - // We weren't able to watch the channel to begin with, so no updates should be made on - // it. Previously, full_stack_target found an (unreachable) panic when the - // monitor update contained within `shutdown_finish` was applied. 
- if let Some((ref mut shutdown_finish, _)) = shutdown_finish { - shutdown_finish.0.take(); + if let Ok(persist_status) = self.chain_monitor.watch_channel(chan.context.get_funding_txo().unwrap(), monitor) { + let mut res = handle_new_monitor_update!(self, persist_status, peer_state_lock, peer_state, per_peer_state, chan_phase_entry, INITIAL_MONITOR); + if let Err(MsgHandleErrInternal { ref mut shutdown_finish, .. }) = res { + // We weren't able to watch the channel to begin with, so no updates should be made on + // it. Previously, full_stack_target found an (unreachable) panic when the + // monitor update contained within `shutdown_finish` was applied. + if let Some((ref mut shutdown_finish, _)) = shutdown_finish { + shutdown_finish.0.take(); + } } + res.map(|_| ()) + } else { + try_chan_phase_entry!(self, Err(ChannelError::Close("Channel funding outpoint was a duplicate".to_owned())), chan_phase_entry) } - res.map(|_| ()) }, _ => { return Err(MsgHandleErrInternal::send_err_msg_no_close("Failed to find corresponding channel".to_owned(), msg.channel_id)); diff --git a/lightning/src/ln/functional_test_utils.rs b/lightning/src/ln/functional_test_utils.rs index f6684485dba..2c4c3c0c267 100644 --- a/lightning/src/ln/functional_test_utils.rs +++ b/lightning/src/ln/functional_test_utils.rs @@ -578,7 +578,7 @@ impl<'a, 'b, 'c> Drop for Node<'a, 'b, 'c> { let chain_source = test_utils::TestChainSource::new(Network::Testnet); let chain_monitor = test_utils::TestChainMonitor::new(Some(&chain_source), &broadcaster, &self.logger, &feeest, &persister, &self.keys_manager); for deserialized_monitor in deserialized_monitors.drain(..) { - if chain_monitor.watch_channel(deserialized_monitor.get_funding_txo().0, deserialized_monitor) != ChannelMonitorUpdateStatus::Completed { + if chain_monitor.watch_channel(deserialized_monitor.get_funding_txo().0, deserialized_monitor) != Ok(ChannelMonitorUpdateStatus::Completed) { panic!(); } } @@ -977,7 +977,7 @@ pub fn _reload_node<'a, 'b, 'c>(node: &'a Node<'a, 'b, 'c>, default_config: User for monitor in monitors_read.drain(..) 
{ assert_eq!(node.chain_monitor.watch_channel(monitor.get_funding_txo().0, monitor), - ChannelMonitorUpdateStatus::Completed); + Ok(ChannelMonitorUpdateStatus::Completed)); check_added_monitors!(node, 1); } diff --git a/lightning/src/ln/functional_tests.rs b/lightning/src/ln/functional_tests.rs index b252a352c48..cf27d6787c4 100644 --- a/lightning/src/ln/functional_tests.rs +++ b/lightning/src/ln/functional_tests.rs @@ -2419,7 +2419,7 @@ fn channel_monitor_network_test() { assert_eq!(nodes[4].node.list_channels().len(), 0); assert_eq!(nodes[3].chain_monitor.chain_monitor.watch_channel(OutPoint { txid: chan_3.3.txid(), index: 0 }, chan_3_mon), - ChannelMonitorUpdateStatus::Completed); + Ok(ChannelMonitorUpdateStatus::Completed)); check_closed_event!(nodes[3], 1, ClosureReason::CommitmentTxConfirmed, [nodes[4].node.get_our_node_id()], 100000); check_closed_event!(nodes[4], 1, ClosureReason::CommitmentTxConfirmed, [nodes[3].node.get_our_node_id()], 100000); } @@ -8453,7 +8453,7 @@ fn test_update_err_monitor_lockdown() { new_monitor }; let watchtower = test_utils::TestChainMonitor::new(Some(&chain_source), &chanmon_cfgs[0].tx_broadcaster, &logger, &chanmon_cfgs[0].fee_estimator, &persister, &node_cfgs[0].keys_manager); - assert_eq!(watchtower.watch_channel(outpoint, new_monitor), ChannelMonitorUpdateStatus::Completed); + assert_eq!(watchtower.watch_channel(outpoint, new_monitor), Ok(ChannelMonitorUpdateStatus::Completed)); watchtower }; let block = create_dummy_block(BlockHash::all_zeros(), 42, Vec::new()); @@ -8475,7 +8475,7 @@ fn test_update_err_monitor_lockdown() { let mut node_0_peer_state_lock; if let ChannelPhase::Funded(ref mut channel) = get_channel_ref!(nodes[0], nodes[1], node_0_per_peer_lock, node_0_peer_state_lock, chan_1.2) { if let Ok(Some(update)) = channel.commitment_signed(&updates.commitment_signed, &node_cfgs[0].logger) { - assert_eq!(watchtower.chain_monitor.update_channel(outpoint, &update), ChannelMonitorUpdateStatus::PermanentFailure); + assert_eq!(watchtower.chain_monitor.update_channel(outpoint, &update), ChannelMonitorUpdateStatus::InProgress); assert_eq!(nodes[0].chain_monitor.update_channel(outpoint, &update), ChannelMonitorUpdateStatus::Completed); } else { assert!(false); } } else { @@ -8526,7 +8526,7 @@ fn test_concurrent_monitor_claim() { new_monitor }; let watchtower = test_utils::TestChainMonitor::new(Some(&chain_source), &alice_broadcaster, &logger, &chanmon_cfgs[0].fee_estimator, &persister, &node_cfgs[0].keys_manager); - assert_eq!(watchtower.watch_channel(outpoint, new_monitor), ChannelMonitorUpdateStatus::Completed); + assert_eq!(watchtower.watch_channel(outpoint, new_monitor), Ok(ChannelMonitorUpdateStatus::Completed)); watchtower }; let block = create_dummy_block(BlockHash::all_zeros(), 42, Vec::new()); @@ -8557,7 +8557,7 @@ fn test_concurrent_monitor_claim() { new_monitor }; let watchtower = test_utils::TestChainMonitor::new(Some(&chain_source), &bob_broadcaster, &logger, &chanmon_cfgs[0].fee_estimator, &persister, &node_cfgs[0].keys_manager); - assert_eq!(watchtower.watch_channel(outpoint, new_monitor), ChannelMonitorUpdateStatus::Completed); + assert_eq!(watchtower.watch_channel(outpoint, new_monitor), Ok(ChannelMonitorUpdateStatus::Completed)); watchtower }; watchtower_bob.chain_monitor.block_connected(&create_dummy_block(BlockHash::all_zeros(), 42, Vec::new()), HTLC_TIMEOUT_BROADCAST - 1); @@ -8577,7 +8577,7 @@ fn test_concurrent_monitor_claim() { if let ChannelPhase::Funded(ref mut channel) = get_channel_ref!(nodes[0], nodes[1], 
node_0_per_peer_lock, node_0_peer_state_lock, chan_1.2) { if let Ok(Some(update)) = channel.commitment_signed(&updates.commitment_signed, &node_cfgs[0].logger) { // Watchtower Alice should already have seen the block and reject the update - assert_eq!(watchtower_alice.chain_monitor.update_channel(outpoint, &update), ChannelMonitorUpdateStatus::PermanentFailure); + assert_eq!(watchtower_alice.chain_monitor.update_channel(outpoint, &update), ChannelMonitorUpdateStatus::InProgress); assert_eq!(watchtower_bob.chain_monitor.update_channel(outpoint, &update), ChannelMonitorUpdateStatus::Completed); assert_eq!(nodes[0].chain_monitor.update_channel(outpoint, &update), ChannelMonitorUpdateStatus::Completed); } else { assert!(false); } diff --git a/lightning/src/ln/reload_tests.rs b/lightning/src/ln/reload_tests.rs index c3a428ae014..fa08fba99fa 100644 --- a/lightning/src/ln/reload_tests.rs +++ b/lightning/src/ln/reload_tests.rs @@ -448,7 +448,7 @@ fn test_manager_serialize_deserialize_inconsistent_monitor() { for monitor in node_0_monitors.drain(..) { assert_eq!(nodes[0].chain_monitor.watch_channel(monitor.get_funding_txo().0, monitor), - ChannelMonitorUpdateStatus::Completed); + Ok(ChannelMonitorUpdateStatus::Completed)); check_added_monitors!(nodes[0], 1); } nodes[0].node = &nodes_0_deserialized; diff --git a/lightning/src/util/persist.rs b/lightning/src/util/persist.rs index 372a094a931..af66e1233ab 100644 --- a/lightning/src/util/persist.rs +++ b/lightning/src/util/persist.rs @@ -174,8 +174,8 @@ impl<'a, A: KVStore, M: Deref, T: Deref, ES: Deref, NS: Deref, SP: Deref, F: Der impl Persist for K { // TODO: We really need a way for the persister to inform the user that its time to crash/shut // down once these start returning failure. - // A PermanentFailure implies we should probably just shut down the node since we're - // force-closing channels without even broadcasting! + // An InProgress result implies we should probably just shut down the node since we're not + // retrying persistence! fn persist_new_channel(&self, funding_txo: OutPoint, monitor: &ChannelMonitor, _update_id: MonitorUpdateId) -> chain::ChannelMonitorUpdateStatus { let key = format!("{}_{}", funding_txo.txid.to_hex(), funding_txo.index); @@ -185,7 +185,7 @@ impl Persist chain::ChannelMonitorUpdateStatus::Completed, - Err(_) => chain::ChannelMonitorUpdateStatus::PermanentFailure, + Err(_) => chain::ChannelMonitorUpdateStatus::InProgress } } @@ -197,7 +197,7 @@ impl Persist chain::ChannelMonitorUpdateStatus::Completed, - Err(_) => chain::ChannelMonitorUpdateStatus::PermanentFailure, + Err(_) => chain::ChannelMonitorUpdateStatus::InProgress } } } diff --git a/lightning/src/util/test_utils.rs b/lightning/src/util/test_utils.rs index 785b40befd4..d53cd39b119 100644 --- a/lightning/src/util/test_utils.rs +++ b/lightning/src/util/test_utils.rs @@ -225,7 +225,7 @@ impl<'a> TestChainMonitor<'a> { } } impl<'a> chain::Watch for TestChainMonitor<'a> { - fn watch_channel(&self, funding_txo: OutPoint, monitor: channelmonitor::ChannelMonitor) -> chain::ChannelMonitorUpdateStatus { + fn watch_channel(&self, funding_txo: OutPoint, monitor: channelmonitor::ChannelMonitor) -> Result { // At every point where we get a monitor update, we should be able to send a useful monitor // to a watchtower and disk... 
 		let mut w = TestVecWriter(Vec::new());

From f24502e9868fa7fc37541896bd169fa10d13636c Mon Sep 17 00:00:00 2001
From: Matt Corallo
Date: Wed, 30 Aug 2023 18:16:03 +0000
Subject: [PATCH 03/11] Drop `channel_perm_failed` tracking in `ChainMonitor`

Now that `PermanentFailure` is not a possible return value, we can simply
remove handling of it in `ChainMonitor`.
---
 lightning/src/chain/chainmonitor.rs | 25 ++-----------------------
 1 file changed, 2 insertions(+), 23 deletions(-)

diff --git a/lightning/src/chain/chainmonitor.rs b/lightning/src/chain/chainmonitor.rs
index e17e2dbd34b..09f72c59489 100644
--- a/lightning/src/chain/chainmonitor.rs
+++ b/lightning/src/chain/chainmonitor.rs
@@ -179,12 +179,6 @@ struct MonitorHolder {
 	/// the ChannelManager re-adding the same payment entry, before the same block is replayed,
 	/// resulting in a duplicate PaymentSent event.
 	pending_monitor_updates: Mutex<Vec<MonitorUpdateId>>,
-	/// When the user returns a PermanentFailure error from an update_persisted_channel call during
-	/// block processing, we inform the ChannelManager that the channel should be closed
-	/// asynchronously. In order to ensure no further changes happen before the ChannelManager has
-	/// processed the closure event, we set this to true and return PermanentFailure for any other
-	/// chain::Watch events.
-	channel_perm_failed: AtomicBool,
 	/// The last block height at which no [`UpdateOrigin::ChainSync`] monitor updates were present
 	/// in `pending_monitor_updates`.
 	/// If it's been more than [`LATENCY_GRACE_PERIOD_BLOCKS`] since we started waiting on a chain
@@ -485,9 +479,8 @@ where C::Target: chain::Filter,
 				// `MonitorEvent`s from the monitor back to the `ChannelManager` until they
 				// complete.
 				let monitor_is_pending_updates = monitor_data.has_pending_offchain_updates(&pending_monitor_updates);
-				if monitor_is_pending_updates || monitor_data.channel_perm_failed.load(Ordering::Acquire) {
-					// If there are still monitor updates pending (or an old monitor update
-					// finished after a later one perm-failed), we cannot yet construct an
+				if monitor_is_pending_updates {
+					// If there are still monitor updates pending, we cannot yet construct a
 					// Completed event.
 					return Ok(());
 				}
@@ -695,7 +688,6 @@ where C::Target: chain::Filter,
 		entry.insert(MonitorHolder {
 			monitor,
 			pending_monitor_updates: Mutex::new(pending_monitor_updates),
-			channel_perm_failed: AtomicBool::new(false),
 			last_chain_persist_height: AtomicUsize::new(self.highest_chain_height.load(Ordering::Acquire)),
 		});
 		Ok(persist_res)
@@ -741,8 +733,6 @@ where C::Target: chain::Filter,
 				}
 				if update_res.is_err() {
 					ChannelMonitorUpdateStatus::InProgress
-				} else if monitor_state.channel_perm_failed.load(Ordering::Acquire) {
-					ChannelMonitorUpdateStatus::InProgress
 				} else {
 					persist_res
 				}
@@ -760,17 +750,6 @@ where C::Target: chain::Filter,
 			{
 				log_debug!(self.logger, "A Channel Monitor sync is still in progress, refusing to provide monitor events!");
 			} else {
-				if monitor_state.channel_perm_failed.load(Ordering::Acquire) {
-					// If a `UpdateOrigin::ChainSync` persistence failed with `PermanantFailure`,
-					// we don't really know if the latest `ChannelMonitor` state is on disk or not.
-					// We're supposed to hold monitor updates until the latest state is on disk to
-					// avoid duplicate events, but the user told us persistence is screw-y and may
-					// not complete. 
We can't hold events forever because we may learn some payment - // preimage, so instead we just log and hope the user complied with the - // `PermanentFailure` requirements of having at least the local-disk copy - // updated. - log_info!(self.logger, "A Channel Monitor sync returned PermanentFailure. Returning monitor events but duplicate events may appear after reload!"); - } if is_pending_monitor_update { log_error!(self.logger, "A ChannelMonitor sync took longer than {} blocks to complete.", LATENCY_GRACE_PERIOD_BLOCKS); log_error!(self.logger, " To avoid funds-loss, we are allowing monitor updates to be released."); From 6e115db22b1e1f89f62c958af1300ec0a47ef27a Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Wed, 30 Aug 2023 18:19:35 +0000 Subject: [PATCH 04/11] Drop `MonitorEvent::UpdateFailed` as it's now unused --- lightning/src/chain/chainmonitor.rs | 2 +- lightning/src/chain/channelmonitor.rs | 9 +++------ lightning/src/ln/channelmanager.rs | 10 ++-------- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/lightning/src/chain/chainmonitor.rs b/lightning/src/chain/chainmonitor.rs index 09f72c59489..4e349799d1c 100644 --- a/lightning/src/chain/chainmonitor.rs +++ b/lightning/src/chain/chainmonitor.rs @@ -44,7 +44,7 @@ use crate::prelude::*; use crate::sync::{RwLock, RwLockReadGuard, Mutex, MutexGuard}; use core::iter::FromIterator; use core::ops::Deref; -use core::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use core::sync::atomic::{AtomicUsize, Ordering}; use bitcoin::secp256k1::PublicKey; #[derive(Clone, Copy, Hash, PartialEq, Eq)] diff --git a/lightning/src/chain/channelmonitor.rs b/lightning/src/chain/channelmonitor.rs index 16a9901e443..3e03c2fd541 100644 --- a/lightning/src/chain/channelmonitor.rs +++ b/lightning/src/chain/channelmonitor.rs @@ -150,13 +150,10 @@ pub enum MonitorEvent { /// same [`ChannelMonitor`] have been applied and persisted. monitor_update_id: u64, }, - - /// Indicates a [`ChannelMonitor`] update has failed. - UpdateFailed(OutPoint), } impl_writeable_tlv_based_enum_upgradable!(MonitorEvent, - // Note that Completed and UpdateFailed are currently never serialized to disk as they are - // generated only in ChainMonitor + // Note that Completed is currently never serialized to disk as it is generated only in + // ChainMonitor.
(0, Completed) => { (0, funding_txo, required), (2, monitor_update_id, required), @@ -164,7 +161,7 @@ impl_writeable_tlv_based_enum_upgradable!(MonitorEvent, ; (2, HTLCEvent), (4, CommitmentTxConfirmed), - (6, UpdateFailed), + // 6 was `UpdateFailed` until LDK 0.0.117 ); /// Simple structure sent back by `chain::Watch` when an HTLC from a forward channel is detected on diff --git a/lightning/src/ln/channelmanager.rs b/lightning/src/ln/channelmanager.rs index 35735c38926..b068b7a7496 100644 --- a/lightning/src/ln/channelmanager.rs +++ b/lightning/src/ln/channelmanager.rs @@ -6729,8 +6729,7 @@ where self.fail_htlc_backwards_internal(&htlc_update.source, &htlc_update.payment_hash, &reason, receiver); } }, - MonitorEvent::CommitmentTxConfirmed(funding_outpoint) | - MonitorEvent::UpdateFailed(funding_outpoint) => { + MonitorEvent::CommitmentTxConfirmed(funding_outpoint) => { let counterparty_node_id_opt = match counterparty_node_id { Some(cp_id) => Some(cp_id), None => { @@ -6754,12 +6753,7 @@ where msg: update }); } - let reason = if let MonitorEvent::UpdateFailed(_) = monitor_event { - ClosureReason::ProcessingError { err: "Failed to persist ChannelMonitor update during chain sync".to_string() } - } else { - ClosureReason::CommitmentTxConfirmed - }; - self.issue_channel_close_events(&chan.context, reason); + self.issue_channel_close_events(&chan.context, ClosureReason::CommitmentTxConfirmed); pending_msg_events.push(events::MessageSendEvent::HandleError { node_id: chan.context.get_counterparty_node_id(), action: msgs::ErrorAction::SendErrorMessage { From a96e2fe144383ea6fd670153fb895ee07a3245ef Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Wed, 30 Aug 2023 18:22:33 +0000 Subject: [PATCH 05/11] Rename `MonitorEvent::CommitmentTxConfirmed` to `HolderForceClosed` The `MonitorEvent::CommitmentTxConfirmed` has always been a result of us force-closing the channel, not the counterparty doing so. Thus, it was always a bit of a misnomer. Worse, it carried over into the channel's `ClosureReason` in the event API. Here we simply rename it and use the proper `ClosureReason`. --- lightning/src/chain/channelmonitor.rs | 16 ++++++++-------- lightning/src/ln/channelmanager.rs | 4 ++-- lightning/src/ln/functional_tests.rs | 12 ++++++------ lightning/src/ln/monitor_tests.rs | 4 ++-- lightning/src/ln/reorg_tests.rs | 2 +- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/lightning/src/chain/channelmonitor.rs b/lightning/src/chain/channelmonitor.rs index 3e03c2fd541..3f9c83bb543 100644 --- a/lightning/src/chain/channelmonitor.rs +++ b/lightning/src/chain/channelmonitor.rs @@ -134,7 +134,7 @@ pub enum MonitorEvent { HTLCEvent(HTLCUpdate), /// A monitor event that the Channel's commitment transaction was confirmed. - CommitmentTxConfirmed(OutPoint), + HolderForceClosed(OutPoint), /// Indicates a [`ChannelMonitor`] update has completed. See /// [`ChannelMonitorUpdateStatus::InProgress`] for more information on how this is used. 
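As a rough illustration (editorial, not part of the diff), code which consumes `MonitorEvent`s itself only needs a mechanical variant rename here; in the sketch below the `handle_*` helpers are hypothetical stand-ins:

fn process_monitor_event(event: MonitorEvent) {
	match event {
		MonitorEvent::HTLCEvent(htlc_update) => handle_htlc_resolution(htlc_update),
		// Formerly `MonitorEvent::CommitmentTxConfirmed`: the monitor broadcast *our* holder
		// commitment transaction, so the closure is attributed to us, not the counterparty.
		MonitorEvent::HolderForceClosed(funding_outpoint) =>
			handle_force_close(funding_outpoint, ClosureReason::HolderForceClosed),
		MonitorEvent::Completed { funding_txo, monitor_update_id } =>
			handle_update_completed(funding_txo, monitor_update_id),
	}
}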
@@ -160,7 +160,7 @@ impl_writeable_tlv_based_enum_upgradable!(MonitorEvent, }, ; (2, HTLCEvent), - (4, CommitmentTxConfirmed), + (4, HolderForceClosed), // 6 was `UpdateFailed` until LDK 0.0.117 ); @@ -1031,7 +1031,7 @@ impl Writeable for ChannelMonitorImpl true, - MonitorEvent::CommitmentTxConfirmed(_) => true, + MonitorEvent::HolderForceClosed(_) => true, _ => false, }).count() as u64).to_be_bytes())?; for event in self.pending_monitor_events.iter() { @@ -1040,7 +1040,7 @@ impl Writeable for ChannelMonitorImpl 1u8.write(writer)?, + MonitorEvent::HolderForceClosed(_) => 1u8.write(writer)?, _ => {}, // Covered in the TLV writes below } } @@ -2526,7 +2526,7 @@ impl ChannelMonitorImpl { txs.push(tx); } broadcaster.broadcast_transactions(&txs); - self.pending_monitor_events.push(MonitorEvent::CommitmentTxConfirmed(self.funding_info.0)); + self.pending_monitor_events.push(MonitorEvent::HolderForceClosed(self.funding_info.0)); } pub fn update_monitor(&mut self, updates: &ChannelMonitorUpdate, broadcaster: &B, fee_estimator: F, logger: &L) -> Result<(), ()> @@ -2639,7 +2639,7 @@ impl ChannelMonitorImpl { log_error!(logger, " in channel monitor for channel {}!", &self.funding_info.0.to_channel_id()); log_error!(logger, " Read the docs for ChannelMonitor::get_latest_holder_commitment_txn and take manual action!"); } else { - // If we generated a MonitorEvent::CommitmentTxConfirmed, the ChannelManager + // If we generated a MonitorEvent::HolderForceClosed, the ChannelManager // will still give us a ChannelForceClosed event with !should_broadcast, but we // shouldn't print the scary warning above. log_info!(logger, "Channel off-chain state closed after we broadcasted our latest commitment transaction."); @@ -3484,7 +3484,7 @@ impl ChannelMonitorImpl { let funding_outp = HolderFundingOutput::build(self.funding_redeemscript.clone(), self.channel_value_satoshis, self.onchain_tx_handler.channel_type_features().clone()); let commitment_package = PackageTemplate::build_package(self.funding_info.0.txid.clone(), self.funding_info.0.index as u32, PackageSolvingData::HolderFundingOutput(funding_outp), self.best_block.height(), self.best_block.height()); claimable_outpoints.push(commitment_package); - self.pending_monitor_events.push(MonitorEvent::CommitmentTxConfirmed(self.funding_info.0)); + self.pending_monitor_events.push(MonitorEvent::HolderForceClosed(self.funding_info.0)); // Although we aren't signing the transaction directly here, the transaction will be signed // in the claim that is queued to OnchainTxHandler. We set holder_tx_signed here to reject // new channel updates. @@ -4248,7 +4248,7 @@ impl<'a, 'b, ES: EntropySource, SP: SignerProvider> ReadableArgs<(&'a ES, &'b SP for _ in 0..pending_monitor_events_len { let ev = match ::read(reader)? 
{ 0 => MonitorEvent::HTLCEvent(Readable::read(reader)?), - 1 => MonitorEvent::CommitmentTxConfirmed(funding_info.0), + 1 => MonitorEvent::HolderForceClosed(funding_info.0), _ => return Err(DecodeError::InvalidValue) }; pending_monitor_events.as_mut().unwrap().push(ev); diff --git a/lightning/src/ln/channelmanager.rs b/lightning/src/ln/channelmanager.rs index b068b7a7496..45873631290 100644 --- a/lightning/src/ln/channelmanager.rs +++ b/lightning/src/ln/channelmanager.rs @@ -6729,7 +6729,7 @@ where self.fail_htlc_backwards_internal(&htlc_update.source, &htlc_update.payment_hash, &reason, receiver); } }, - MonitorEvent::CommitmentTxConfirmed(funding_outpoint) => { + MonitorEvent::HolderForceClosed(funding_outpoint) => { let counterparty_node_id_opt = match counterparty_node_id { Some(cp_id) => Some(cp_id), None => { @@ -6753,7 +6753,7 @@ where msg: update }); } - self.issue_channel_close_events(&chan.context, ClosureReason::CommitmentTxConfirmed); + self.issue_channel_close_events(&chan.context, ClosureReason::HolderForceClosed); pending_msg_events.push(events::MessageSendEvent::HandleError { node_id: chan.context.get_counterparty_node_id(), action: msgs::ErrorAction::SendErrorMessage { diff --git a/lightning/src/ln/functional_tests.rs b/lightning/src/ln/functional_tests.rs index cf27d6787c4..fa766f0ab55 100644 --- a/lightning/src/ln/functional_tests.rs +++ b/lightning/src/ln/functional_tests.rs @@ -2408,6 +2408,7 @@ fn channel_monitor_network_test() { } check_added_monitors!(nodes[4], 1); test_txn_broadcast(&nodes[4], &chan_4, None, HTLCType::SUCCESS); + check_closed_event!(nodes[4], 1, ClosureReason::HolderForceClosed, [nodes[3].node.get_our_node_id()], 100000); mine_transaction(&nodes[4], &node_txn[0]); check_preimage_claim(&nodes[4], &node_txn); @@ -2420,8 +2421,7 @@ fn channel_monitor_network_test() { assert_eq!(nodes[3].chain_monitor.chain_monitor.watch_channel(OutPoint { txid: chan_3.3.txid(), index: 0 }, chan_3_mon), Ok(ChannelMonitorUpdateStatus::Completed)); - check_closed_event!(nodes[3], 1, ClosureReason::CommitmentTxConfirmed, [nodes[4].node.get_our_node_id()], 100000); - check_closed_event!(nodes[4], 1, ClosureReason::CommitmentTxConfirmed, [nodes[3].node.get_our_node_id()], 100000); + check_closed_event!(nodes[3], 1, ClosureReason::HolderForceClosed, [nodes[4].node.get_our_node_id()], 100000); } #[test] @@ -5660,7 +5660,7 @@ fn do_htlc_claim_local_commitment_only(use_dust: bool) { test_txn_broadcast(&nodes[1], &chan, None, if use_dust { HTLCType::NONE } else { HTLCType::SUCCESS }); check_closed_broadcast!(nodes[1], true); check_added_monitors!(nodes[1], 1); - check_closed_event!(nodes[1], 1, ClosureReason::CommitmentTxConfirmed, [nodes[0].node.get_our_node_id()], 100000); + check_closed_event!(nodes[1], 1, ClosureReason::HolderForceClosed, [nodes[0].node.get_our_node_id()], 100000); } fn do_htlc_claim_current_remote_commitment_only(use_dust: bool) { @@ -5691,7 +5691,7 @@ fn do_htlc_claim_current_remote_commitment_only(use_dust: bool) { test_txn_broadcast(&nodes[0], &chan, None, HTLCType::NONE); check_closed_broadcast!(nodes[0], true); check_added_monitors!(nodes[0], 1); - check_closed_event!(nodes[0], 1, ClosureReason::CommitmentTxConfirmed, [nodes[1].node.get_our_node_id()], 100000); + check_closed_event!(nodes[0], 1, ClosureReason::HolderForceClosed, [nodes[1].node.get_our_node_id()], 100000); } fn do_htlc_claim_previous_remote_commitment_only(use_dust: bool, check_revoke_no_close: bool) { @@ -5737,7 +5737,7 @@ fn do_htlc_claim_previous_remote_commitment_only(use_dust: bool, 
check_revoke_no test_txn_broadcast(&nodes[0], &chan, None, HTLCType::NONE); check_closed_broadcast!(nodes[0], true); check_added_monitors!(nodes[0], 1); - check_closed_event!(nodes[0], 1, ClosureReason::CommitmentTxConfirmed, [nodes[1].node.get_our_node_id()], 100000); + check_closed_event!(nodes[0], 1, ClosureReason::HolderForceClosed, [nodes[1].node.get_our_node_id()], 100000); } else { expect_payment_failed!(nodes[0], our_payment_hash, true); } @@ -8603,7 +8603,7 @@ fn test_concurrent_monitor_claim() { let height = HTLC_TIMEOUT_BROADCAST + 1; connect_blocks(&nodes[0], height - nodes[0].best_block_info().1); check_closed_broadcast(&nodes[0], 1, true); - check_closed_event!(&nodes[0], 1, ClosureReason::CommitmentTxConfirmed, false, + check_closed_event!(&nodes[0], 1, ClosureReason::HolderForceClosed, false, [nodes[1].node.get_our_node_id()], 100000); watchtower_alice.chain_monitor.block_connected(&create_dummy_block(BlockHash::all_zeros(), 42, vec![bob_state_y.clone()]), height); check_added_monitors(&nodes[0], 1); diff --git a/lightning/src/ln/monitor_tests.rs b/lightning/src/ln/monitor_tests.rs index ece24794f63..cb78cda714f 100644 --- a/lightning/src/ln/monitor_tests.rs +++ b/lightning/src/ln/monitor_tests.rs @@ -1046,14 +1046,14 @@ fn do_test_revoked_counterparty_commitment_balances(confirm_htlc_spend_first: bo assert!(failed_payments.is_empty()); if let Event::PendingHTLCsForwardable { .. } = events[0] {} else { panic!(); } match &events[1] { - Event::ChannelClosed { reason: ClosureReason::CommitmentTxConfirmed, .. } => {}, + Event::ChannelClosed { reason: ClosureReason::HolderForceClosed, .. } => {}, _ => panic!(), } connect_blocks(&nodes[1], htlc_cltv_timeout + 1 - 10); check_closed_broadcast!(nodes[1], true); check_added_monitors!(nodes[1], 1); - check_closed_event!(nodes[1], 1, ClosureReason::CommitmentTxConfirmed, [nodes[0].node.get_our_node_id()], 1000000); + check_closed_event!(nodes[1], 1, ClosureReason::HolderForceClosed, [nodes[0].node.get_our_node_id()], 1000000); // Prior to channel closure, B considers the preimage HTLC as its own, and otherwise only // lists the two on-chain timeout-able HTLCs as claimable balances. diff --git a/lightning/src/ln/reorg_tests.rs b/lightning/src/ln/reorg_tests.rs index 7c57b553068..cb3471763d4 100644 --- a/lightning/src/ln/reorg_tests.rs +++ b/lightning/src/ln/reorg_tests.rs @@ -462,7 +462,7 @@ fn test_set_outpoints_partial_claiming() { // Connect blocks on node B connect_blocks(&nodes[1], TEST_FINAL_CLTV + LATENCY_GRACE_PERIOD_BLOCKS + 1); check_closed_broadcast!(nodes[1], true); - check_closed_event!(nodes[1], 1, ClosureReason::CommitmentTxConfirmed, [nodes[0].node.get_our_node_id()], 1000000); + check_closed_event!(nodes[1], 1, ClosureReason::HolderForceClosed, [nodes[0].node.get_our_node_id()], 1000000); check_added_monitors!(nodes[1], 1); // Verify node B broadcast 2 HTLC-timeout txn let partial_claim_tx = { From e5bd7920bddb4f312741673e37a07cd859fd24fb Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Fri, 8 Sep 2023 00:05:56 +0000 Subject: [PATCH 06/11] Update `ChannelMonitorUpdateStatus` documentation with async support Since we now (almost) support async monitor update persistence, the documentation on `ChannelMonitorUpdateStatus` can be updated to no longer suggest users must keep a local copy that persists before returning. However, because there are still a few remaining issues, we note that async support is currently beta and explicitly warn of potential for funds-loss.
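As a rough sketch of the flow the updated docs describe (illustration only, not part of the diff; `spawn_background_write` is a hypothetical helper and the signer generic is a stand-in):

fn persist_new_channel(&self, funding_txo: OutPoint, monitor: &ChannelMonitor<Signer>,
	update_id: MonitorUpdateId) -> ChannelMonitorUpdateStatus
{
	// Hand the serialized monitor off to a background task (which must fsync() before
	// reporting completion) and tell LDK the persistence is still in flight.
	self.spawn_background_write(funding_txo, monitor.encode(), update_id);
	ChannelMonitorUpdateStatus::InProgress
}
// ...later, once the background write has completed durably, the update is marked done so the
// channel can resume normal operation:
// chain_monitor.channel_monitor_updated(funding_txo, update_id)?;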
Fixes #1684 --- lightning/src/chain/chainmonitor.rs | 4 +-- lightning/src/chain/mod.rs | 44 ++++++++++++++++++----------- lightning/src/ln/channelmanager.rs | 14 +++++---- 3 files changed, 37 insertions(+), 25 deletions(-) diff --git a/lightning/src/chain/chainmonitor.rs b/lightning/src/chain/chainmonitor.rs index 4e349799d1c..4986e054a35 100644 --- a/lightning/src/chain/chainmonitor.rs +++ b/lightning/src/chain/chainmonitor.rs @@ -82,8 +82,8 @@ impl MonitorUpdateId { /// * If persistence (including any relevant `fsync()` calls) happens immediately, the /// implementation should return [`ChannelMonitorUpdateStatus::Completed`], indicating normal /// channel operation should continue. -/// * If persistence happens asynchronously, implementations should first ensure the -/// [`ChannelMonitor`] or [`ChannelMonitorUpdate`] are written durably to disk, and then return +/// +/// * If persistence happens asynchronously, implementations can return /// [`ChannelMonitorUpdateStatus::InProgress`] while the update continues in the background. /// Once the update completes, [`ChainMonitor::channel_monitor_updated`] should be called with /// the corresponding [`MonitorUpdateId`]. diff --git a/lightning/src/chain/mod.rs b/lightning/src/chain/mod.rs index 736da139a55..1abaa77f3e4 100644 --- a/lightning/src/chain/mod.rs +++ b/lightning/src/chain/mod.rs @@ -176,6 +176,18 @@ pub trait Confirm { } /// An enum representing the status of a channel monitor update persistence. +/// +/// Note that there is no error variant - any failure to persist a [`ChannelMonitor`] should be +/// retried indefinitely, the node shut down (as if we cannot update stored data we can't do much +/// of anything useful). +/// +/// Note that channels should generally *not* be force-closed after a persistence failure. +/// Force-closing with the latest [`ChannelMonitorUpdate`] applied may result in a transaction +/// being broadcast which can only be spent by the latest [`ChannelMonitor`]! Thus, if the +/// latest [`ChannelMonitor`] is not durably persisted anywhere and exists only in memory, naively +/// calling [`ChannelManager::force_close_broadcasting_latest_txn`] *may result in loss of funds*! +/// +/// [`ChannelManager::force_close_broadcasting_latest_txn`]: crate::ln::channelmanager::ChannelManager::force_close_broadcasting_latest_txn #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum ChannelMonitorUpdateStatus { /// The update has been durably persisted and all copies of the relevant [`ChannelMonitor`] @@ -184,13 +196,13 @@ pub enum ChannelMonitorUpdateStatus { /// This includes performing any `fsync()` calls required to ensure the update is guaranteed to /// be available on restart even if the application crashes. Completed, - /// Used to indicate a temporary failure (eg connection to a watchtower or remote backup of - /// our state failed, but is expected to succeed at some point in the future). + /// Indicates that the update will happen asynchronously in the background or that a transient + /// failure occurred which is being retried in the background and will eventually complete. /// - /// Such a failure will "freeze" a channel, preventing us from revoking old states or - /// submitting new commitment transactions to the counterparty. Once the update(s) which failed - /// have been successfully applied, a [`MonitorEvent::Completed`] can be used to restore the - /// channel to an operational state. 
+ /// This will "freeze" a channel, preventing us from revoking old states or submitting a new + /// commitment transaction to the counterparty. Once the update(s) which are `InProgress` have + /// been completed, a [`MonitorEvent::Completed`] can be used to restore the channel to an + /// operational state. /// /// Even when a channel has been "frozen", updates to the [`ChannelMonitor`] can continue to /// occur (e.g. if an inbound HTLC which we forwarded was claimed upstream, resulting in us @@ -204,6 +216,10 @@ pub enum ChannelMonitorUpdateStatus { /// remote location (with local copies persisted immediately), it is anticipated that all /// updates will return [`InProgress`] until the remote copies could be updated. /// + /// Note that while fully asynchronous persistence of [`ChannelMonitor`] data is generally + /// reliable, this feature is considered beta, and a handful of edge-cases remain. Until the + /// remaining cases are fixed, in rare cases, *using this feature may lead to funds loss*. + /// /// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress InProgress, } @@ -212,18 +228,12 @@ pub enum ChannelMonitorUpdateStatus { /// blocks are connected and disconnected. /// /// Each channel is associated with a [`ChannelMonitor`]. Implementations of this trait are -/// responsible for maintaining a set of monitors such that they can be updated accordingly as -/// channel state changes and HTLCs are resolved. See method documentation for specific -/// requirements. -/// -/// Implementations **must** ensure that updates are successfully applied and persisted upon method -/// completion. If an update will not succeed, then it must immediately shut down. +/// responsible for maintaining a set of monitors such that they can be updated as channel state +/// changes. On each update, *all copies* of a [`ChannelMonitor`] must be updated and the update +/// persisted to disk to ensure that the latest [`ChannelMonitor`] state can be reloaded if the +/// application crashes. /// -/// If an implementation maintains multiple instances of a channel's monitor (e.g., by storing -/// backup copies), then it must ensure that updates are applied across all instances. Otherwise, it -/// could result in a revoked transaction being broadcast, allowing the counterparty to claim all -/// funds in the channel. See [`ChannelMonitorUpdateStatus`] for more details about how to handle -/// multiple instances. +/// See method documentation and [`ChannelMonitorUpdateStatus`] for specific requirements. pub trait Watch { /// Watches a channel identified by `funding_txo` using `monitor`. /// diff --git a/lightning/src/ln/channelmanager.rs b/lightning/src/ln/channelmanager.rs index 45873631290..a7e7375f9a4 100644 --- a/lightning/src/ln/channelmanager.rs +++ b/lightning/src/ln/channelmanager.rs @@ -923,12 +923,14 @@ where /// called [`funding_transaction_generated`] for outbound channels) being closed. /// /// Note that you can be a bit lazier about writing out `ChannelManager` than you can be with -/// [`ChannelMonitor`]. With [`ChannelMonitor`] you MUST write each monitor update out to disk before -/// returning from [`chain::Watch::watch_channel`]/[`update_channel`], with ChannelManagers, writing updates -/// happens out-of-band (and will prevent any other `ChannelManager` operations from occurring during -/// the serialization process). 
If the deserialized version is out-of-date compared to the -/// [`ChannelMonitor`] passed by reference to [`read`], those channels will be force-closed based on the -/// `ChannelMonitor` state and no funds will be lost (mod on-chain transaction fees). +/// [`ChannelMonitor`]. With [`ChannelMonitor`] you MUST durably write each +/// [`ChannelMonitorUpdate`] before returning from +/// [`chain::Watch::watch_channel`]/[`update_channel`] or before completing async writes. With +/// `ChannelManager`s, writing updates happens out-of-band (and will prevent any other +/// `ChannelManager` operations from occurring during the serialization process). If the +/// deserialized version is out-of-date compared to the [`ChannelMonitor`] passed by reference to +/// [`read`], those channels will be force-closed based on the `ChannelMonitor` state and no funds +/// will be lost (modulo on-chain transaction fees). /// /// Note that the deserializer is only implemented for `(`[`BlockHash`]`, `[`ChannelManager`]`)`, which /// tells you the last block hash which was connected. You should get the best block tip before using the manager. From 574c77e7bc95fd8dea5a8058b6b35996cc99db8d Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Sun, 10 Sep 2023 17:39:24 +0000 Subject: [PATCH 07/11] Drop `PermanentFailure` persistence handling in ChannelManager --- lightning/src/ln/channelmanager.rs | 131 +++++++++-------------------- 1 file changed, 38 insertions(+), 93 deletions(-) diff --git a/lightning/src/ln/channelmanager.rs b/lightning/src/ln/channelmanager.rs index a7e7375f9a4..8e2af31dadd 100644 --- a/lightning/src/ln/channelmanager.rs +++ b/lightning/src/ln/channelmanager.rs @@ -2051,11 +2051,11 @@ macro_rules! handle_new_monitor_update { ChannelMonitorUpdateStatus::InProgress => { log_debug!($self.logger, "ChannelMonitor update for {} in flight, holding messages until the update completes.", &$chan.context.channel_id()); - Ok(false) + false }, ChannelMonitorUpdateStatus::Completed => { $completed; - Ok(true) + true }, } } }; @@ -2070,14 +2070,9 @@ macro_rules! handle_new_monitor_update { $per_peer_state_lock, chan, MANUALLY_REMOVING_INITIAL_MONITOR, { $chan_entry.remove() }) } else { // We're not supposed to handle monitor updates for unfunded channels (they have no monitors to - // update). - debug_assert!(false); - let channel_id = *$chan_entry.key(); - let (_, err) = convert_chan_phase_err!($self, ChannelError::Close( "Cannot update monitor for unfunded channels as they don't have monitors yet".into()), - $chan_entry.get_mut(), &channel_id); - $chan_entry.remove(); - Err(err) + // update). Throwing away a monitor update could be dangerous, so we assert even in + // release builds. + panic!("Initial Monitors should not exist for non-funded channels"); } }; ($self: ident, $funding_txo: expr, $update: expr, $peer_state_lock: expr, $peer_state: expr, $per_peer_state_lock: expr, $chan: expr, MANUALLY_REMOVING, $remove: expr) => { { @@ -2107,14 +2102,9 @@ macro_rules! handle_new_monitor_update { $per_peer_state_lock, chan, MANUALLY_REMOVING, { $chan_entry.remove() }) } else { // We're not supposed to handle monitor updates for unfunded channels (they have no monitors to - // update).
Throwing away a monitor update could be dangerous, so we assert even in + // release builds. + panic!("Monitor updates should not exist for non-funded channels"); } } } @@ -2529,7 +2519,7 @@ where let _persistence_guard = PersistenceNotifierGuard::notify_on_drop(self); let mut failed_htlcs: Vec<(HTLCSource, PaymentHash)>; - let result: Result<(), _> = loop { + loop { { let per_peer_state = self.per_peer_state.read().unwrap(); @@ -2558,8 +2548,9 @@ where // Update the monitor with the shutdown script if necessary. if let Some(monitor_update) = monitor_update_opt.take() { - break handle_new_monitor_update!(self, funding_txo_opt.unwrap(), monitor_update, - peer_state_lock, peer_state, per_peer_state, chan_phase_entry).map(|_| ()); + handle_new_monitor_update!(self, funding_txo_opt.unwrap(), monitor_update, + peer_state_lock, peer_state, per_peer_state, chan_phase_entry); + break; } if chan.is_shutdown() { @@ -2572,7 +2563,7 @@ where self.issue_channel_close_events(&chan.context, ClosureReason::HolderForceClosed); } } - break Ok(()); + break; } }, hash_map::Entry::Vacant(_) => (), @@ -2591,7 +2582,6 @@ where self.fail_htlc_backwards_internal(&htlc_source.0, &htlc_source.1, &reason, receiver); } - let _ = handle_error!(self, result, *counterparty_node_id); Ok(()) } @@ -3334,8 +3324,7 @@ where match break_chan_phase_entry!(self, send_res, chan_phase_entry) { Some(monitor_update) => { match handle_new_monitor_update!(self, funding_txo, monitor_update, peer_state_lock, peer_state, per_peer_state, chan_phase_entry) { - Err(e) => break Err(e), - Ok(false) => { + false => { // Note that MonitorUpdateInProgress here indicates (per function // docs) that we will resend the commitment update once monitor // updating completes. Therefore, we must return an error @@ -3344,7 +3333,7 @@ where // MonitorUpdateInProgress, below. return Err(APIError::MonitorUpdateInProgress); }, - Ok(true) => {}, + true => {}, } }, None => {}, @@ -4526,7 +4515,7 @@ where }, BackgroundEvent::MonitorUpdateRegeneratedOnStartup { counterparty_node_id, funding_txo, update } => { let mut updated_chan = false; - let res = { + { let per_peer_state = self.per_peer_state.read().unwrap(); if let Some(peer_state_mutex) = per_peer_state.get(&counterparty_node_id) { let mut peer_state_lock = peer_state_mutex.lock().unwrap(); @@ -4535,24 +4524,16 @@ where hash_map::Entry::Occupied(mut chan_phase) => { updated_chan = true; handle_new_monitor_update!(self, funding_txo, update.clone(), - peer_state_lock, peer_state, per_peer_state, chan_phase).map(|_| ()) + peer_state_lock, peer_state, per_peer_state, chan_phase); }, - hash_map::Entry::Vacant(_) => Ok(()), + hash_map::Entry::Vacant(_) => {}, } - } else { Ok(()) } - }; + } + } if !updated_chan { // TODO: Track this as in-flight even though the channel is closed. let _ = self.chain_monitor.update_channel(funding_txo, &update); } - // TODO: If this channel has since closed, we're likely providing a payment - // preimage update, which we must ensure is durable! We currently don't, - // however, ensure that. - if res.is_err() { - log_error!(self.logger, - "Failed to provide ChannelMonitorUpdate to closed channel! 
This likely lost us a payment preimage!"); - } - let _ = handle_error!(self, res, counterparty_node_id); }, BackgroundEvent::MonitorUpdatesComplete { counterparty_node_id, channel_id } => { let per_peer_state = self.per_peer_state.read().unwrap(); @@ -5275,15 +5256,8 @@ where peer_state.monitor_update_blocked_actions.entry(chan_id).or_insert(Vec::new()).push(action); } if !during_init { - let res = handle_new_monitor_update!(self, prev_hop.outpoint, monitor_update, peer_state_lock, + handle_new_monitor_update!(self, prev_hop.outpoint, monitor_update, peer_state_lock, peer_state, per_peer_state, chan_phase_entry); - if let Err(e) = res { - // TODO: This is a *critical* error - we probably updated the outbound edge - // of the HTLC's monitor with a preimage. We should retry this monitor - // update over and over again until morale improves. - log_error!(self.logger, "Failed to update channel monitor with preimage {:?}", payment_preimage); - return Err((counterparty_node_id, e)); - } } else { // If we're running during init we cannot update a monitor directly - // they probably haven't actually been loaded yet. Instead, push the @@ -5934,24 +5908,13 @@ where }); if let ChannelPhase::Funded(chan) = e.insert(ChannelPhase::Funded(chan)) { - let mut res = handle_new_monitor_update!(self, persist_state, peer_state_lock, peer_state, + handle_new_monitor_update!(self, persist_state, peer_state_lock, peer_state, per_peer_state, chan, MANUALLY_REMOVING_INITIAL_MONITOR, { peer_state.channel_by_id.remove(&new_channel_id) }); - - // Note that we reply with the new channel_id in error messages if we gave up on the - // channel, not the temporary_channel_id. This is compatible with ourselves, but the - // spec is somewhat ambiguous here. Not a huge deal since we'll send error messages for - // any messages referencing a previously-closed channel anyway. - // We do not propagate the monitor update to the user as it would be for a monitor - // that we didn't manage to store (and that we don't care about - we don't respond - // with the funding_signed so the channel can never go on chain). - if let Err(MsgHandleErrInternal { shutdown_finish: Some((res, _)), .. }) = &mut res { - res.0 = None; - } - res.map(|_| ()) } else { unreachable!("This must be a funded channel as we just inserted it."); } + Ok(()) } else { log_error!(self.logger, "Persisting initial ChannelMonitor failed, implying the funding outpoint was duplicated"); return Err(MsgHandleErrInternal::send_err_msg_no_close( @@ -5982,16 +5945,8 @@ where let monitor = try_chan_phase_entry!(self, chan.funding_signed(&msg, best_block, &self.signer_provider, &self.logger), chan_phase_entry); if let Ok(persist_status) = self.chain_monitor.watch_channel(chan.context.get_funding_txo().unwrap(), monitor) { - let mut res = handle_new_monitor_update!(self, persist_status, peer_state_lock, peer_state, per_peer_state, chan_phase_entry, INITIAL_MONITOR); - if let Err(MsgHandleErrInternal { ref mut shutdown_finish, .. }) = res { - // We weren't able to watch the channel to begin with, so no updates should be made on - // it. Previously, full_stack_target found an (unreachable) panic when the - // monitor update contained within `shutdown_finish` was applied. 
- if let Some((ref mut shutdown_finish, _)) = shutdown_finish { - shutdown_finish.0.take(); - } - } - res.map(|_| ()) + handle_new_monitor_update!(self, persist_status, peer_state_lock, peer_state, per_peer_state, chan_phase_entry, INITIAL_MONITOR); + Ok(()) } else { try_chan_phase_entry!(self, Err(ChannelError::Close("Channel funding outpoint was a duplicate".to_owned())), chan_phase_entry) } @@ -6096,8 +6051,8 @@ where } // Update the monitor with the shutdown script if necessary. if let Some(monitor_update) = monitor_update_opt { - break handle_new_monitor_update!(self, funding_txo_opt.unwrap(), monitor_update, - peer_state_lock, peer_state, per_peer_state, chan_phase_entry).map(|_| ()); + handle_new_monitor_update!(self, funding_txo_opt.unwrap(), monitor_update, + peer_state_lock, peer_state, per_peer_state, chan_phase_entry); } break Ok(()); }, @@ -6350,8 +6305,9 @@ where let monitor_update_opt = try_chan_phase_entry!(self, chan.commitment_signed(&msg, &self.logger), chan_phase_entry); if let Some(monitor_update) = monitor_update_opt { handle_new_monitor_update!(self, funding_txo.unwrap(), monitor_update, peer_state_lock, - peer_state, per_peer_state, chan_phase_entry).map(|_| ()) - } else { Ok(()) } + peer_state, per_peer_state, chan_phase_entry); + } + Ok(()) } else { return try_chan_phase_entry!(self, Err(ChannelError::Close( "Got a commitment_signed message for an unfunded channel!".into())), chan_phase_entry); @@ -6519,13 +6475,13 @@ where } else { false }; let (htlcs_to_fail, monitor_update_opt) = try_chan_phase_entry!(self, chan.revoke_and_ack(&msg, &self.fee_estimator, &self.logger, mon_update_blocked), chan_phase_entry); - let res = if let Some(monitor_update) = monitor_update_opt { + if let Some(monitor_update) = monitor_update_opt { let funding_txo = funding_txo_opt .expect("Funding outpoint must have been set for RAA handling to succeed"); handle_new_monitor_update!(self, funding_txo, monitor_update, - peer_state_lock, peer_state, per_peer_state, chan_phase_entry).map(|_| ()) - } else { Ok(()) }; - (htlcs_to_fail, res) + peer_state_lock, peer_state, per_peer_state, chan_phase_entry); + } + (htlcs_to_fail, Ok(())) } else { return try_chan_phase_entry!(self, Err(ChannelError::Close( "Got a revoke_and_ack message for an unfunded channel!".into())), chan_phase_entry); @@ -6796,7 +6752,6 @@ where fn check_free_holding_cells(&self) -> bool { let mut has_monitor_update = false; let mut failed_htlcs = Vec::new(); - let mut handle_errors = Vec::new(); // Walk our list of channels and find any that need to update. Note that when we do find an // update, if it includes actions that must be taken afterwards, we have to drop the @@ -6822,12 +6777,9 @@ where has_monitor_update = true; let channel_id: ChannelId = *channel_id; - let res = handle_new_monitor_update!(self, funding_txo.unwrap(), monitor_update, + handle_new_monitor_update!(self, funding_txo.unwrap(), monitor_update, peer_state_lock, peer_state, per_peer_state, chan, MANUALLY_REMOVING, peer_state.channel_by_id.remove(&channel_id)); - if res.is_err() { - handle_errors.push((counterparty_node_id, res)); - } continue 'peer_loop; } } @@ -6837,15 +6789,11 @@ where break 'peer_loop; } - let has_update = has_monitor_update || !failed_htlcs.is_empty() || !handle_errors.is_empty(); + let has_update = has_monitor_update || !failed_htlcs.is_empty(); for (failures, channel_id, counterparty_node_id) in failed_htlcs.drain(..) 
{ self.fail_holding_cell_htlcs(failures, channel_id, &counterparty_node_id); } - for (counterparty_node_id, err) in handle_errors.drain(..) { - let _ = handle_error!(self, err, counterparty_node_id); - } - has_update } @@ -7172,11 +7120,8 @@ where if let Some((monitor_update, further_update_exists)) = chan.unblock_next_blocked_monitor_update() { log_debug!(self.logger, "Unlocking monitor updating for channel {} and updating monitor", channel_funding_outpoint.to_channel_id()); - if let Err(e) = handle_new_monitor_update!(self, channel_funding_outpoint, monitor_update, - peer_state_lck, peer_state, per_peer_state, chan_phase_entry) - { - errors.push((e, counterparty_node_id)); - } + handle_new_monitor_update!(self, channel_funding_outpoint, monitor_update, + peer_state_lck, peer_state, per_peer_state, chan_phase_entry); if further_update_exists { // If there are more `ChannelMonitorUpdate`s to process, restart at the // top of the loop. From 341163eeb599a8c5c79e06f0e6d6e8fb2a1e756d Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Sun, 10 Sep 2023 17:48:49 +0000 Subject: [PATCH 08/11] Clean up code flow in `ChannelManager` In the previous commit various dead code was removed. Here we finish that cleanup by removing uneccessary indentation and syntax. --- lightning/src/ln/channelmanager.rs | 105 +++++++++++++++-------------- 1 file changed, 53 insertions(+), 52 deletions(-) diff --git a/lightning/src/ln/channelmanager.rs b/lightning/src/ln/channelmanager.rs index 8e2af31dadd..29c0141539b 100644 --- a/lightning/src/ln/channelmanager.rs +++ b/lightning/src/ln/channelmanager.rs @@ -2520,61 +2520,63 @@ where let mut failed_htlcs: Vec<(HTLCSource, PaymentHash)>; loop { - { - let per_peer_state = self.per_peer_state.read().unwrap(); + let per_peer_state = self.per_peer_state.read().unwrap(); - let peer_state_mutex = per_peer_state.get(counterparty_node_id) - .ok_or_else(|| APIError::ChannelUnavailable { err: format!("Can't find a peer matching the passed counterparty node_id {}", counterparty_node_id) })?; + let peer_state_mutex = per_peer_state.get(counterparty_node_id) + .ok_or_else(|| APIError::ChannelUnavailable { err: format!("Can't find a peer matching the passed counterparty node_id {}", counterparty_node_id) })?; - let mut peer_state_lock = peer_state_mutex.lock().unwrap(); - let peer_state = &mut *peer_state_lock; + let mut peer_state_lock = peer_state_mutex.lock().unwrap(); + let peer_state = &mut *peer_state_lock; - match peer_state.channel_by_id.entry(channel_id.clone()) { - hash_map::Entry::Occupied(mut chan_phase_entry) => { - if let ChannelPhase::Funded(chan) = chan_phase_entry.get_mut() { - let funding_txo_opt = chan.context.get_funding_txo(); - let their_features = &peer_state.latest_features; - let (shutdown_msg, mut monitor_update_opt, htlcs) = - chan.get_shutdown(&self.signer_provider, their_features, target_feerate_sats_per_1000_weight, override_shutdown_script)?; - failed_htlcs = htlcs; + match peer_state.channel_by_id.entry(channel_id.clone()) { + hash_map::Entry::Occupied(mut chan_phase_entry) => { + if let ChannelPhase::Funded(chan) = chan_phase_entry.get_mut() { + let funding_txo_opt = chan.context.get_funding_txo(); + let their_features = &peer_state.latest_features; + let (shutdown_msg, mut monitor_update_opt, htlcs) = + chan.get_shutdown(&self.signer_provider, their_features, target_feerate_sats_per_1000_weight, override_shutdown_script)?; + failed_htlcs = htlcs; + + // We can send the `shutdown` message before updating the `ChannelMonitor` + // here as we don't need 
the monitor update to complete until we send a + // `shutdown_signed`, which we'll delay if we're pending a monitor update. + peer_state.pending_msg_events.push(events::MessageSendEvent::SendShutdown { + node_id: *counterparty_node_id, + msg: shutdown_msg, + }); - // We can send the `shutdown` message before updating the `ChannelMonitor` - // here as we don't need the monitor update to complete until we send a - // `shutdown_signed`, which we'll delay if we're pending a monitor update. - peer_state.pending_msg_events.push(events::MessageSendEvent::SendShutdown { - node_id: *counterparty_node_id, - msg: shutdown_msg, - }); + debug_assert!(monitor_update_opt.is_none() || !chan.is_shutdown(), + "We can't both complete shutdown and generate a monitor update"); - // Update the monitor with the shutdown script if necessary. - if let Some(monitor_update) = monitor_update_opt.take() { - handle_new_monitor_update!(self, funding_txo_opt.unwrap(), monitor_update, - peer_state_lock, peer_state, per_peer_state, chan_phase_entry); - break; - } + // Update the monitor with the shutdown script if necessary. + if let Some(monitor_update) = monitor_update_opt.take() { + handle_new_monitor_update!(self, funding_txo_opt.unwrap(), monitor_update, + peer_state_lock, peer_state, per_peer_state, chan_phase_entry); + break; + } - if chan.is_shutdown() { - if let ChannelPhase::Funded(chan) = remove_channel_phase!(self, chan_phase_entry) { - if let Ok(channel_update) = self.get_channel_update_for_broadcast(&chan) { - peer_state.pending_msg_events.push(events::MessageSendEvent::BroadcastChannelUpdate { - msg: channel_update - }); - } - self.issue_channel_close_events(&chan.context, ClosureReason::HolderForceClosed); + if chan.is_shutdown() { + if let ChannelPhase::Funded(chan) = remove_channel_phase!(self, chan_phase_entry) { + if let Ok(channel_update) = self.get_channel_update_for_broadcast(&chan) { + peer_state.pending_msg_events.push(events::MessageSendEvent::BroadcastChannelUpdate { + msg: channel_update + }); } + self.issue_channel_close_events(&chan.context, ClosureReason::HolderForceClosed); } - break; } - }, - hash_map::Entry::Vacant(_) => (), - } + break; + } + }, + hash_map::Entry::Vacant(_) => { + // If we reach this point, it means that the channel_id either refers to an unfunded channel or + // it does not exist for this peer. Either way, we can attempt to force-close it. + // + // An appropriate error will be returned for non-existence of the channel if that's the case. + return self.force_close_channel_with_peer(&channel_id, counterparty_node_id, None, false).map(|_| ()) + }, } - // If we reach this point, it means that the channel_id either refers to an unfunded channel or - // it does not exist for this peer. Either way, we can attempt to force-close it. - // - // An appropriate error will be returned for non-existence of the channel if that's the case. - return self.force_close_channel_with_peer(&channel_id, counterparty_node_id, None, false).map(|_| ()) - }; + } for htlc_source in failed_htlcs.drain(..) 
{ let reason = HTLCFailReason::from_failure_code(0x4000 | 8); @@ -6016,7 +6018,7 @@ where fn internal_shutdown(&self, counterparty_node_id: &PublicKey, msg: &msgs::Shutdown) -> Result<(), MsgHandleErrInternal> { let mut dropped_htlcs: Vec<(HTLCSource, PaymentHash)>; - let result: Result<(), _> = loop { + { let per_peer_state = self.per_peer_state.read().unwrap(); let peer_state_mutex = per_peer_state.get(counterparty_node_id) .ok_or_else(|| { @@ -6054,7 +6056,6 @@ where handle_new_monitor_update!(self, funding_txo_opt.unwrap(), monitor_update, peer_state_lock, peer_state, per_peer_state, chan_phase_entry); } - break Ok(()); }, ChannelPhase::UnfundedInboundV1(_) | ChannelPhase::UnfundedOutboundV1(_) => { let context = phase.context_mut(); @@ -6068,14 +6069,14 @@ where } else { return Err(MsgHandleErrInternal::send_err_msg_no_close(format!("Got a message for a channel from the wrong node! No such channel for the passed counterparty_node_id {}", counterparty_node_id), msg.channel_id)) } - }; + } for htlc_source in dropped_htlcs.drain(..) { let receiver = HTLCDestination::NextHopChannel { node_id: Some(counterparty_node_id.clone()), channel_id: msg.channel_id }; let reason = HTLCFailReason::from_failure_code(0x4000 | 8); self.fail_htlc_backwards_internal(&htlc_source.0, &htlc_source.1, &reason, receiver); } - result + Ok(()) } fn internal_closing_signed(&self, counterparty_node_id: &PublicKey, msg: &msgs::ClosingSigned) -> Result<(), MsgHandleErrInternal> { @@ -6456,7 +6457,7 @@ where } fn internal_revoke_and_ack(&self, counterparty_node_id: &PublicKey, msg: &msgs::RevokeAndACK) -> Result<(), MsgHandleErrInternal> { - let (htlcs_to_fail, res) = { + let htlcs_to_fail = { let per_peer_state = self.per_peer_state.read().unwrap(); let mut peer_state_lock = per_peer_state.get(counterparty_node_id) .ok_or_else(|| { @@ -6481,7 +6482,7 @@ where handle_new_monitor_update!(self, funding_txo, monitor_update, peer_state_lock, peer_state, per_peer_state, chan_phase_entry); } - (htlcs_to_fail, Ok(())) + htlcs_to_fail } else { return try_chan_phase_entry!(self, Err(ChannelError::Close( "Got a revoke_and_ack message for an unfunded channel!".into())), chan_phase_entry); @@ -6491,7 +6492,7 @@ where } }; self.fail_holding_cell_htlcs(htlcs_to_fail, msg.channel_id, counterparty_node_id); - res + Ok(()) } fn internal_update_fee(&self, counterparty_node_id: &PublicKey, msg: &msgs::UpdateFee) -> Result<(), MsgHandleErrInternal> { From 971f7a7e42652d5697d761df6650ae9e6dcaa5db Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Sun, 10 Sep 2023 20:05:43 +0000 Subject: [PATCH 09/11] Drop error handling in `handle_new_monitor_update` Now that `handle_new_monitor_update` can no longer return an error, it similarly no longer needs any code to handle errors. Here we remove that code, substantially reducing macro variants. --- lightning/src/ln/channelmanager.rs | 76 +++++++++--------------------- 1 file changed, 22 insertions(+), 54 deletions(-) diff --git a/lightning/src/ln/channelmanager.rs b/lightning/src/ln/channelmanager.rs index 29c0141539b..747a78e8c91 100644 --- a/lightning/src/ln/channelmanager.rs +++ b/lightning/src/ln/channelmanager.rs @@ -2042,10 +2042,7 @@ macro_rules! handle_monitor_update_completion { } macro_rules! 
handle_new_monitor_update { - ($self: ident, $update_res: expr, $peer_state_lock: expr, $peer_state: expr, $per_peer_state_lock: expr, $chan: expr, _internal, $remove: expr, $completed: expr) => { { - // update_maps_on_chan_removal needs to be able to take id_to_peer, so make sure we can in - // any case so that it won't deadlock. - debug_assert_ne!($self.id_to_peer.held_by_thread(), LockHeldState::HeldByThread); + ($self: ident, $update_res: expr, $chan: expr, _internal, $completed: expr) => { { debug_assert!($self.background_events_processed_since_startup.load(Ordering::Acquire)); match $update_res { ChannelMonitorUpdateStatus::InProgress => { @@ -2059,23 +2056,11 @@ macro_rules! handle_new_monitor_update { }, } } }; - ($self: ident, $update_res: expr, $peer_state_lock: expr, $peer_state: expr, $per_peer_state_lock: expr, $chan: expr, MANUALLY_REMOVING_INITIAL_MONITOR, $remove: expr) => { - handle_new_monitor_update!($self, $update_res, $peer_state_lock, $peer_state, - $per_peer_state_lock, $chan, _internal, $remove, + ($self: ident, $update_res: expr, $peer_state_lock: expr, $peer_state: expr, $per_peer_state_lock: expr, $chan: expr, INITIAL_MONITOR) => { + handle_new_monitor_update!($self, $update_res, $chan, _internal, handle_monitor_update_completion!($self, $peer_state_lock, $peer_state, $per_peer_state_lock, $chan)) }; - ($self: ident, $update_res: expr, $peer_state_lock: expr, $peer_state: expr, $per_peer_state_lock: expr, $chan_entry: expr, INITIAL_MONITOR) => { - if let ChannelPhase::Funded(chan) = $chan_entry.get_mut() { - handle_new_monitor_update!($self, $update_res, $peer_state_lock, $peer_state, - $per_peer_state_lock, chan, MANUALLY_REMOVING_INITIAL_MONITOR, { $chan_entry.remove() }) - } else { - // We're not supposed to handle monitor updates for unfunded channels (they have no monitors to - // update). Throwing away a monitor update could be dangerous, so we assert even in - // release builds. - panic!("Initial Monitors should not exist for non-funded channels"); - } - }; - ($self: ident, $funding_txo: expr, $update: expr, $peer_state_lock: expr, $peer_state: expr, $per_peer_state_lock: expr, $chan: expr, MANUALLY_REMOVING, $remove: expr) => { { + ($self: ident, $funding_txo: expr, $update: expr, $peer_state_lock: expr, $peer_state: expr, $per_peer_state_lock: expr, $chan: expr) => { { let in_flight_updates = $peer_state.in_flight_monitor_updates.entry($funding_txo) .or_insert_with(Vec::new); // During startup, we push monitor updates as background events through to here in @@ -2087,8 +2072,7 @@ macro_rules! handle_new_monitor_update { in_flight_updates.len() - 1 }); let update_res = $self.chain_monitor.update_channel($funding_txo, &in_flight_updates[idx]); - handle_new_monitor_update!($self, update_res, $peer_state_lock, $peer_state, - $per_peer_state_lock, $chan, _internal, $remove, + handle_new_monitor_update!($self, update_res, $chan, _internal, { let _ = in_flight_updates.remove(idx); if in_flight_updates.is_empty() && $chan.blocked_monitor_updates_pending() == 0 { @@ -2096,17 +2080,6 @@ macro_rules! 
handle_new_monitor_update { } }) } }; - ($self: ident, $funding_txo: expr, $update: expr, $peer_state_lock: expr, $peer_state: expr, $per_peer_state_lock: expr, $chan_entry: expr) => { - if let ChannelPhase::Funded(chan) = $chan_entry.get_mut() { - handle_new_monitor_update!($self, $funding_txo, $update, $peer_state_lock, $peer_state, - $per_peer_state_lock, chan, MANUALLY_REMOVING, { $chan_entry.remove() }) - } else { - // We're not supposed to handle monitor updates for unfunded channels (they have no monitors to - // update). Throwing away a monitor update could be dangerous, so we assert even in - // release builds. - panic!("Monitor updates should not exist for non-funded channels"); - } - } } macro_rules! process_events_body { @@ -2551,7 +2524,7 @@ where // Update the monitor with the shutdown script if necessary. if let Some(monitor_update) = monitor_update_opt.take() { handle_new_monitor_update!(self, funding_txo_opt.unwrap(), monitor_update, - peer_state_lock, peer_state, per_peer_state, chan_phase_entry); + peer_state_lock, peer_state, per_peer_state, chan); break; } @@ -3325,7 +3298,7 @@ where }, onion_packet, None, &self.fee_estimator, &self.logger); match break_chan_phase_entry!(self, send_res, chan_phase_entry) { Some(monitor_update) => { - match handle_new_monitor_update!(self, funding_txo, monitor_update, peer_state_lock, peer_state, per_peer_state, chan_phase_entry) { + match handle_new_monitor_update!(self, funding_txo, monitor_update, peer_state_lock, peer_state, per_peer_state, chan) { false => { // Note that MonitorUpdateInProgress here indicates (per function // docs) that we will resend the commitment update once monitor @@ -4524,9 +4497,13 @@ where let peer_state = &mut *peer_state_lock; match peer_state.channel_by_id.entry(funding_txo.to_channel_id()) { hash_map::Entry::Occupied(mut chan_phase) => { - updated_chan = true; - handle_new_monitor_update!(self, funding_txo, update.clone(), - peer_state_lock, peer_state, per_peer_state, chan_phase); + if let ChannelPhase::Funded(chan) = chan_phase.get_mut() { + updated_chan = true; + handle_new_monitor_update!(self, funding_txo, update.clone(), + peer_state_lock, peer_state, per_peer_state, chan); + } else { + debug_assert!(false, "We shouldn't have an update for a non-funded channel"); + } }, hash_map::Entry::Vacant(_) => {}, } @@ -5259,7 +5236,7 @@ where } if !during_init { handle_new_monitor_update!(self, prev_hop.outpoint, monitor_update, peer_state_lock, - peer_state, per_peer_state, chan_phase_entry); + peer_state, per_peer_state, chan); } else { // If we're running during init we cannot update a monitor directly - // they probably haven't actually been loaded yet. Instead, push the @@ -5903,7 +5880,6 @@ where // hasn't persisted to disk yet - we can't lose money on a transaction that we haven't // accepted payment from yet. We do, however, need to wait to send our channel_ready // until we have persisted our monitor. 
- let new_channel_id = funding_msg.channel_id; peer_state.pending_msg_events.push(events::MessageSendEvent::SendFundingSigned { node_id: counterparty_node_id.clone(), msg: funding_msg, @@ -5911,8 +5887,7 @@ where if let ChannelPhase::Funded(chan) = e.insert(ChannelPhase::Funded(chan)) { handle_new_monitor_update!(self, persist_state, peer_state_lock, peer_state, - per_peer_state, chan, MANUALLY_REMOVING_INITIAL_MONITOR, - { peer_state.channel_by_id.remove(&new_channel_id) }); + per_peer_state, chan, INITIAL_MONITOR); } else { unreachable!("This must be a funded channel as we just inserted it."); } @@ -5947,7 +5922,7 @@ where let monitor = try_chan_phase_entry!(self, chan.funding_signed(&msg, best_block, &self.signer_provider, &self.logger), chan_phase_entry); if let Ok(persist_status) = self.chain_monitor.watch_channel(chan.context.get_funding_txo().unwrap(), monitor) { - handle_new_monitor_update!(self, persist_status, peer_state_lock, peer_state, per_peer_state, chan_phase_entry, INITIAL_MONITOR); + handle_new_monitor_update!(self, persist_status, peer_state_lock, peer_state, per_peer_state, chan, INITIAL_MONITOR); Ok(()) } else { try_chan_phase_entry!(self, Err(ChannelError::Close("Channel funding outpoint was a duplicate".to_owned())), chan_phase_entry) @@ -6054,7 +6029,7 @@ where // Update the monitor with the shutdown script if necessary. if let Some(monitor_update) = monitor_update_opt { handle_new_monitor_update!(self, funding_txo_opt.unwrap(), monitor_update, - peer_state_lock, peer_state, per_peer_state, chan_phase_entry); + peer_state_lock, peer_state, per_peer_state, chan); } }, ChannelPhase::UnfundedInboundV1(_) | ChannelPhase::UnfundedOutboundV1(_) => { @@ -6306,7 +6281,7 @@ where let monitor_update_opt = try_chan_phase_entry!(self, chan.commitment_signed(&msg, &self.logger), chan_phase_entry); if let Some(monitor_update) = monitor_update_opt { handle_new_monitor_update!(self, funding_txo.unwrap(), monitor_update, peer_state_lock, - peer_state, per_peer_state, chan_phase_entry); + peer_state, per_peer_state, chan); } Ok(()) } else { @@ -6480,7 +6455,7 @@ where let funding_txo = funding_txo_opt .expect("Funding outpoint must have been set for RAA handling to succeed"); handle_new_monitor_update!(self, funding_txo, monitor_update, - peer_state_lock, peer_state, per_peer_state, chan_phase_entry); + peer_state_lock, peer_state, per_peer_state, chan); } htlcs_to_fail } else { @@ -6777,10 +6752,8 @@ where if let Some(monitor_update) = monitor_opt { has_monitor_update = true; - let channel_id: ChannelId = *channel_id; handle_new_monitor_update!(self, funding_txo.unwrap(), monitor_update, - peer_state_lock, peer_state, per_peer_state, chan, MANUALLY_REMOVING, - peer_state.channel_by_id.remove(&channel_id)); + peer_state_lock, peer_state, per_peer_state, chan); continue 'peer_loop; } } @@ -7089,7 +7062,6 @@ where /// operation. It will double-check that nothing *else* is also blocking the same channel from /// making progress and then let any blocked [`ChannelMonitorUpdate`]s fly. 
fn handle_monitor_update_release(&self, counterparty_node_id: PublicKey, channel_funding_outpoint: OutPoint, mut completed_blocker: Option) { - let mut errors = Vec::new(); loop { let per_peer_state = self.per_peer_state.read().unwrap(); if let Some(peer_state_mtx) = per_peer_state.get(&counterparty_node_id) { @@ -7122,7 +7094,7 @@ where log_debug!(self.logger, "Unlocking monitor updating for channel {} and updating monitor", channel_funding_outpoint.to_channel_id()); handle_new_monitor_update!(self, channel_funding_outpoint, monitor_update, - peer_state_lck, peer_state, per_peer_state, chan_phase_entry); + peer_state_lck, peer_state, per_peer_state, chan); if further_update_exists { // If there are more `ChannelMonitorUpdate`s to process, restart at the // top of the loop. @@ -7141,10 +7113,6 @@ where } break; } - for (err, counterparty_node_id) in errors { - let res = Err::<(), _>(err); - let _ = handle_error!(self, res, counterparty_node_id); - } } fn handle_post_event_actions(&self, actions: Vec) { From aa9c601774b11d1ef2958961f7479599625e798b Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Sun, 10 Sep 2023 20:21:50 +0000 Subject: [PATCH 10/11] Drop doc comments on `ChainMonitor` trait impl methods In general, doc comments on trait impl blocks are not very visible in rustdoc output, and unless they provide useful information they should be elided. Here we drop useless doc comments on `ChainMonitor`'s `Watch` impl methods. --- lightning/src/chain/chainmonitor.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/lightning/src/chain/chainmonitor.rs b/lightning/src/chain/chainmonitor.rs index 4986e054a35..7dbf308a568 100644 --- a/lightning/src/chain/chainmonitor.rs +++ b/lightning/src/chain/chainmonitor.rs @@ -654,12 +654,6 @@ where C::Target: chain::Filter, L::Target: Logger, P::Target: Persist, { - /// Adds the monitor that watches the channel referred to by the given outpoint. - /// - /// Calls back to [`chain::Filter`] with the funding transaction and outputs to watch. - /// - /// Note that we persist the given `ChannelMonitor` while holding the `ChainMonitor` - /// monitors lock. fn watch_channel(&self, funding_outpoint: OutPoint, monitor: ChannelMonitor) -> Result { let mut monitors = self.monitors.write().unwrap(); let entry = match monitors.entry(funding_outpoint) { @@ -693,8 +687,6 @@ where C::Target: chain::Filter, Ok(persist_res) } - /// Note that we persist the given `ChannelMonitor` update while holding the - /// `ChainMonitor` monitors lock. fn update_channel(&self, funding_txo: OutPoint, update: &ChannelMonitorUpdate) -> ChannelMonitorUpdateStatus { // Update the monitor that watches the channel referred to by the given outpoint. let monitors = self.monitors.read().unwrap(); From f254c56585688a187a58dc284477cb1cd39b00a9 Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Thu, 14 Sep 2023 20:02:46 +0000 Subject: [PATCH 11/11] Add an `UnrecoverableError` variant to `ChannelMonitorUpdateStatus` While there is no great way to handle a true failure to persist a `ChannelMonitorUpdate`, it is confusing for users for there to be no error variant at all on an I/O operation. Thus, here we re-add the error variant removed over the past handful of commits, but rather than handle it in a truly unsafe way, we simply panic, optimizing for maximum mutex poisoning to ensure any future operations fail and return immediately. 
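For illustration (not part of the diff), a synchronous persister might now map persistence results onto the three variants roughly as follows, where `write_monitor_durably` and `is_transient` are hypothetical helpers:

match self.write_monitor_durably(funding_txo, monitor) {
	// Fully written and fsync()ed - normal channel operation continues.
	Ok(()) => ChannelMonitorUpdateStatus::Completed,
	// Transient failure - retry in the background and report the update as in-flight; the
	// channel stays frozen until `ChainMonitor::channel_monitor_updated` is called.
	Err(e) if e.is_transient() => ChannelMonitorUpdateStatus::InProgress,
	// Truly unrecoverable - returning this causes LDK to panic rather than keep operating
	// with state it cannot persist.
	Err(_) => ChannelMonitorUpdateStatus::UnrecoverableError,
}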
In the future, we may consider changing the handling of this to instead set some "disconnect all peers and fail all operations" bool to give the user a better chance to shutdown in a semi-orderly fashion, but there's only so much that can be done in lightning if we truly cannot persist new updates. --- lightning-persister/src/fs_store.rs | 4 +- lightning/src/chain/chainmonitor.rs | 123 ++++++++++++++++++---- lightning/src/chain/mod.rs | 35 ++++-- lightning/src/ln/channelmanager.rs | 5 + lightning/src/ln/functional_test_utils.rs | 4 + lightning/src/util/persist.rs | 8 +- 6 files changed, 144 insertions(+), 35 deletions(-) diff --git a/lightning-persister/src/fs_store.rs b/lightning-persister/src/fs_store.rs index 6725e16974b..42b28018fe9 100644 --- a/lightning-persister/src/fs_store.rs +++ b/lightning-persister/src/fs_store.rs @@ -470,7 +470,7 @@ mod tests { index: 0 }; match store.persist_new_channel(test_txo, &added_monitors[0].1, update_id.2) { - ChannelMonitorUpdateStatus::InProgress => {}, + ChannelMonitorUpdateStatus::UnrecoverableError => {}, _ => panic!("unexpected result from persisting new channel") } @@ -507,7 +507,7 @@ mod tests { index: 0 }; match store.persist_new_channel(test_txo, &added_monitors[0].1, update_id.2) { - ChannelMonitorUpdateStatus::InProgress => {}, + ChannelMonitorUpdateStatus::UnrecoverableError => {}, _ => panic!("unexpected result from persisting new channel") } diff --git a/lightning/src/chain/chainmonitor.rs b/lightning/src/chain/chainmonitor.rs index 7dbf308a568..b6909cb3e41 100644 --- a/lightning/src/chain/chainmonitor.rs +++ b/lightning/src/chain/chainmonitor.rs @@ -78,26 +78,48 @@ impl MonitorUpdateId { /// `Persist` defines behavior for persisting channel monitors: this could mean /// writing once to disk, and/or uploading to one or more backup services. /// -/// Each method can return two possible values: -/// * If persistence (including any relevant `fsync()` calls) happens immediately, the -/// implementation should return [`ChannelMonitorUpdateStatus::Completed`], indicating normal -/// channel operation should continue. +/// Persistence can happen in one of two ways - synchronously completing before the trait method +/// calls return or asynchronously in the background. /// -/// * If persistence happens asynchronously, implementations can return -/// [`ChannelMonitorUpdateStatus::InProgress`] while the update continues in the background. -/// Once the update completes, [`ChainMonitor::channel_monitor_updated`] should be called with -/// the corresponding [`MonitorUpdateId`]. +/// # For those implementing synchronous persistence /// -/// Note that unlike the direct [`chain::Watch`] interface, -/// [`ChainMonitor::channel_monitor_updated`] must be called once for *each* update which occurs. +/// * If persistence completes fully (including any relevant `fsync()` calls), the implementation +/// should return [`ChannelMonitorUpdateStatus::Completed`], indicating normal channel operation +/// should continue. /// -/// If persistence fails for some reason, implementations should still return -/// [`ChannelMonitorUpdateStatus::InProgress`] and attempt to shut down or otherwise resolve the -/// situation ASAP. +/// * If persistence fails for some reason, implementations should consider returning +/// [`ChannelMonitorUpdateStatus::InProgress`] and retry all pending persistence operations in +/// the background with [`ChainMonitor::list_pending_monitor_updates`] and +/// [`ChainMonitor::get_monitor`]. 
/// -/// Third-party watchtowers may be built as a part of an implementation of this trait, with the -/// advantage that you can control whether to resume channel operation depending on if an update -/// has been persisted to a watchtower. For this, you may find the following methods useful: +/// Once a full [`ChannelMonitor`] has been persisted, all pending updates for that channel can +/// be marked as complete via [`ChainMonitor::channel_monitor_updated`]. +/// +/// If at some point no further progress can be made towards persisting the pending updates, the +/// node should simply shut down. +/// +/// * If the persistence has failed and cannot be retried further (e.g. because of some timeout), +/// [`ChannelMonitorUpdateStatus::UnrecoverableError`] can be used, though this will result in +/// an immediate panic and future operations in LDK generally failing. +/// +/// # For those implementing asynchronous persistence +/// +/// All calls should generally spawn a background task and immediately return +/// [`ChannelMonitorUpdateStatus::InProgress`]. Once the update completes, +/// [`ChainMonitor::channel_monitor_updated`] should be called with the corresponding +/// [`MonitorUpdateId`]. +/// +/// Note that unlike the direct [`chain::Watch`] interface, +/// [`ChainMonitor::channel_monitor_updated`] must be called once for *each* update which occurs. +/// +/// If at some point no further progress can be made towards persisting a pending update, the node +/// should simply shut down. +/// +/// # Using remote watchtowers +/// +/// Watchtowers may be updated as a part of an implementation of this trait, utilizing the async +/// update process described above while the watchtower is being updated. The following methods are +/// provided for building transactions for a watchtower: /// [`ChannelMonitor::initial_counterparty_commitment_tx`], /// [`ChannelMonitor::counterparty_commitment_txs_from_update`], /// [`ChannelMonitor::sign_to_local_justice_tx`], [`TrustedCommitmentTransaction::revokeable_output_index`], @@ -279,11 +301,20 @@ where C::Target: chain::Filter, where FN: Fn(&ChannelMonitor, &TransactionData) -> Vec { + let err_str = "ChannelMonitor[Update] persistence failed unrecoverably. This indicates we cannot continue normal operation and must shut down."; let funding_outpoints: HashSet = HashSet::from_iter(self.monitors.read().unwrap().keys().cloned()); for funding_outpoint in funding_outpoints.iter() { let monitor_lock = self.monitors.read().unwrap(); if let Some(monitor_state) = monitor_lock.get(funding_outpoint) { - self.update_monitor_with_chain_data(header, best_height, txdata, &process, funding_outpoint, &monitor_state); + if self.update_monitor_with_chain_data(header, best_height, txdata, &process, funding_outpoint, &monitor_state).is_err() { + // Take the monitors lock for writing so that we poison it and any future + // operations going forward fail immediately.
+ core::mem::drop(monitor_state); + core::mem::drop(monitor_lock); + let _poison = self.monitors.write().unwrap(); + log_error!(self.logger, "{}", err_str); + panic!("{}", err_str); + } } } @@ -291,7 +322,10 @@ where C::Target: chain::Filter, let monitor_states = self.monitors.write().unwrap(); for (funding_outpoint, monitor_state) in monitor_states.iter() { if !funding_outpoints.contains(funding_outpoint) { - self.update_monitor_with_chain_data(header, best_height, txdata, &process, funding_outpoint, &monitor_state); + if self.update_monitor_with_chain_data(header, best_height, txdata, &process, funding_outpoint, &monitor_state).is_err() { + log_error!(self.logger, "{}", err_str); + panic!("{}", err_str); + } } } @@ -306,7 +340,10 @@ where C::Target: chain::Filter, } } - fn update_monitor_with_chain_data(&self, header: &BlockHeader, best_height: Option, txdata: &TransactionData, process: FN, funding_outpoint: &OutPoint, monitor_state: &MonitorHolder) where FN: Fn(&ChannelMonitor, &TransactionData) -> Vec { + fn update_monitor_with_chain_data( + &self, header: &BlockHeader, best_height: Option, txdata: &TransactionData, + process: FN, funding_outpoint: &OutPoint, monitor_state: &MonitorHolder + ) -> Result<(), ()> where FN: Fn(&ChannelMonitor, &TransactionData) -> Vec { let monitor = &monitor_state.monitor; let mut txn_outputs; { @@ -331,7 +368,10 @@ where C::Target: chain::Filter, ChannelMonitorUpdateStatus::InProgress => { log_debug!(self.logger, "Channel Monitor sync for channel {} in progress, holding events until completion!", log_funding_info!(monitor)); pending_monitor_updates.push(update_id); - } + }, + ChannelMonitorUpdateStatus::UnrecoverableError => { + return Err(()); + }, } } @@ -351,6 +391,7 @@ where C::Target: chain::Filter, } } } + Ok(()) } /// Creates a new `ChainMonitor` used to watch on-chain activity pertaining to channels. @@ -674,7 +715,12 @@ where C::Target: chain::Filter, }, ChannelMonitorUpdateStatus::Completed => { log_info!(self.logger, "Persistence of new ChannelMonitor for channel {} completed", log_funding_info!(monitor)); - } + }, + ChannelMonitorUpdateStatus::UnrecoverableError => { + let err_str = "ChannelMonitor[Update] persistence failed unrecoverably. This indicates we cannot continue normal operation and must shut down."; + log_error!(self.logger, "{}", err_str); + panic!("{}", err_str); + }, } if let Some(ref chain_source) = self.chain_source { monitor.load_outputs_to_watch(chain_source); @@ -690,7 +736,7 @@ where C::Target: chain::Filter, fn update_channel(&self, funding_txo: OutPoint, update: &ChannelMonitorUpdate) -> ChannelMonitorUpdateStatus { // Update the monitor that watches the channel referred to by the given outpoint. 
let monitors = self.monitors.read().unwrap(); - match monitors.get(&funding_txo) { + let ret = match monitors.get(&funding_txo) { None => { log_error!(self.logger, "Failed to update channel monitor: no such monitor registered"); @@ -722,6 +768,7 @@ where C::Target: chain::Filter, ChannelMonitorUpdateStatus::Completed => { log_debug!(self.logger, "Persistence of ChannelMonitorUpdate for channel {} completed", log_funding_info!(monitor)); }, + ChannelMonitorUpdateStatus::UnrecoverableError => { /* we'll panic in a moment */ }, } if update_res.is_err() { ChannelMonitorUpdateStatus::InProgress @@ -729,7 +776,17 @@ where C::Target: chain::Filter, persist_res } } + }; + if let ChannelMonitorUpdateStatus::UnrecoverableError = ret { + // Take the monitors lock for writing so that we poison it and any future + // operations going forward fail immediately. + core::mem::drop(monitors); + let _poison = self.monitors.write().unwrap(); + let err_str = "ChannelMonitor[Update] persistence failed unrecoverably. This indicates we cannot continue normal operation and must shut down."; + log_error!(self.logger, "{}", err_str); + panic!("{}", err_str); } + ret } fn release_pending_monitor_events(&self) -> Vec<(OutPoint, Vec, Option)> { @@ -973,4 +1030,26 @@ mod tests { do_chainsync_pauses_events(false); do_chainsync_pauses_events(true); } + + #[test] + #[cfg(feature = "std")] + fn update_during_chainsync_poisons_channel() { + let chanmon_cfgs = create_chanmon_cfgs(2); + let node_cfgs = create_node_cfgs(2, &chanmon_cfgs); + let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]); + let nodes = create_network(2, &node_cfgs, &node_chanmgrs); + create_announced_chan_between_nodes(&nodes, 0, 1); + + chanmon_cfgs[0].persister.chain_sync_monitor_persistences.lock().unwrap().clear(); + chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::UnrecoverableError); + + assert!(std::panic::catch_unwind(|| { + // Returning an UnrecoverableError should always panic immediately + connect_blocks(&nodes[0], 1); + }).is_err()); + assert!(std::panic::catch_unwind(|| { + // ...and also poison our locks causing later use to panic as well + core::mem::drop(nodes); + }).is_err()); + } } diff --git a/lightning/src/chain/mod.rs b/lightning/src/chain/mod.rs index 1abaa77f3e4..89e0b155cf6 100644 --- a/lightning/src/chain/mod.rs +++ b/lightning/src/chain/mod.rs @@ -177,9 +177,14 @@ pub trait Confirm { /// An enum representing the status of a channel monitor update persistence. /// -/// Note that there is no error variant - any failure to persist a [`ChannelMonitor`] should be -/// retried indefinitely, the node shut down (as if we cannot update stored data we can't do much -/// of anything useful). +/// These are generally used as the return value for an implementation of [`Persist`] which is used +/// as the storage layer for a [`ChainMonitor`]. See the docs on [`Persist`] for a high-level +/// explanation of how to handle different cases. +/// +/// While `UnrecoverableError` is provided as a failure variant, it is not truly "handled" on the +/// calling side, and generally results in an immediate panic. For those who prefer to avoid +/// panics, `InProgress` can be used and you can retry the update operation in the background or +/// shut down cleanly. /// /// Note that channels should generally *not* be force-closed after a persistence failure. 
/// Force-closing with the latest [`ChannelMonitorUpdate`] applied may result in a transaction @@ -187,6 +192,8 @@ pub trait Confirm { /// latest [`ChannelMonitor`] is not durably persisted anywhere and exists only in memory, naively /// calling [`ChannelManager::force_close_broadcasting_latest_txn`] *may result in loss of funds*! /// +/// [`Persist`]: chainmonitor::Persist +/// [`ChainMonitor`]: chainmonitor::ChainMonitor /// [`ChannelManager::force_close_broadcasting_latest_txn`]: crate::ln::channelmanager::ChannelManager::force_close_broadcasting_latest_txn #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum ChannelMonitorUpdateStatus { @@ -212,8 +219,8 @@ pub enum ChannelMonitorUpdateStatus { /// until a [`MonitorEvent::Completed`] is provided, even if you return no error on a later /// monitor update for the same channel. /// - /// For deployments where a copy of ChannelMonitors and other local state are backed up in a - /// remote location (with local copies persisted immediately), it is anticipated that all + /// For deployments where a copy of [`ChannelMonitor`]s and other local state are backed up in + /// a remote location (with local copies persisted immediately), it is anticipated that all /// updates will return [`InProgress`] until the remote copies could be updated. /// /// Note that while fully asynchronous persistence of [`ChannelMonitor`] data is generally @@ -222,6 +229,18 @@ pub enum ChannelMonitorUpdateStatus { /// /// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress InProgress, + /// Indicates that an update has failed and will not complete at any point in the future. + /// + /// Currently returning this variant will cause LDK to immediately panic to encourage immediate + /// shutdown. In the future this may be updated to disconnect peers and refuse to continue + /// normal operation without a panic. + /// + /// Applications which wish to perform an orderly shutdown after failure should consider + /// returning [`InProgress`] instead and simply shut down without ever marking the update + /// complete. + /// + /// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress + UnrecoverableError, } /// The `Watch` trait defines behavior for watching on-chain activity pertaining to channels as @@ -261,8 +280,10 @@ pub trait Watch { /// on-chain or the [`ChannelMonitor`] having decided to do so and broadcasted a transaction), /// and the [`ChannelManager`] state will be updated once it sees the funding spend on-chain. /// - /// If persistence fails, this should return [`ChannelMonitorUpdateStatus::InProgress`] and - /// the node should shut down immediately. + /// In general, persistence failures should be retried after returning + /// [`ChannelMonitorUpdateStatus::InProgress`] and eventually complete. If a failure truly + /// cannot be retried, the node should shut down immediately after returning + /// [`ChannelMonitorUpdateStatus::UnrecoverableError`], see its documentation for more info. /// /// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager fn update_channel(&self, funding_txo: OutPoint, update: &ChannelMonitorUpdate) -> ChannelMonitorUpdateStatus; diff --git a/lightning/src/ln/channelmanager.rs b/lightning/src/ln/channelmanager.rs index 747a78e8c91..8b524d8f3f4 100644 --- a/lightning/src/ln/channelmanager.rs +++ b/lightning/src/ln/channelmanager.rs @@ -2045,6 +2045,11 @@ macro_rules! 
handle_new_monitor_update { ($self: ident, $update_res: expr, $chan: expr, _internal, $completed: expr) => { { debug_assert!($self.background_events_processed_since_startup.load(Ordering::Acquire)); match $update_res { + ChannelMonitorUpdateStatus::UnrecoverableError => { + let err_str = "ChannelMonitor[Update] persistence failed unrecoverably. This indicates we cannot continue normal operation and must shut down."; + log_error!($self.logger, "{}", err_str); + panic!("{}", err_str); + }, ChannelMonitorUpdateStatus::InProgress => { log_debug!($self.logger, "ChannelMonitor update for {} in flight, holding messages until the update completes.", &$chan.context.channel_id()); diff --git a/lightning/src/ln/functional_test_utils.rs b/lightning/src/ln/functional_test_utils.rs index 2c4c3c0c267..5a1a21e2999 100644 --- a/lightning/src/ln/functional_test_utils.rs +++ b/lightning/src/ln/functional_test_utils.rs @@ -422,6 +422,10 @@ pub struct Node<'chan_man, 'node_cfg: 'chan_man, 'chan_mon_cfg: 'node_cfg> { &'chan_mon_cfg test_utils::TestLogger, >, } +#[cfg(feature = "std")] +impl<'a, 'b, 'c> std::panic::UnwindSafe for Node<'a, 'b, 'c> {} +#[cfg(feature = "std")] +impl<'a, 'b, 'c> std::panic::RefUnwindSafe for Node<'a, 'b, 'c> {} impl<'a, 'b, 'c> Node<'a, 'b, 'c> { pub fn best_block_hash(&self) -> BlockHash { self.blocks.lock().unwrap().last().unwrap().0.block_hash() diff --git a/lightning/src/util/persist.rs b/lightning/src/util/persist.rs index af66e1233ab..431c62c9fb8 100644 --- a/lightning/src/util/persist.rs +++ b/lightning/src/util/persist.rs @@ -174,8 +174,8 @@ impl<'a, A: KVStore, M: Deref, T: Deref, ES: Deref, NS: Deref, SP: Deref, F: Der impl Persist for K { // TODO: We really need a way for the persister to inform the user that its time to crash/shut // down once these start returning failure. - // An InProgress result implies we should probably just shut down the node since we're not - // retrying persistence! + // Then we should return InProgress rather than UnrecoverableError, implying we should probably + // just shut down the node since we're not retrying persistence! fn persist_new_channel(&self, funding_txo: OutPoint, monitor: &ChannelMonitor, _update_id: MonitorUpdateId) -> chain::ChannelMonitorUpdateStatus { let key = format!("{}_{}", funding_txo.txid.to_hex(), funding_txo.index); @@ -185,7 +185,7 @@ impl Persist chain::ChannelMonitorUpdateStatus::Completed, - Err(_) => chain::ChannelMonitorUpdateStatus::InProgress + Err(_) => chain::ChannelMonitorUpdateStatus::UnrecoverableError } } @@ -197,7 +197,7 @@ impl Persist chain::ChannelMonitorUpdateStatus::Completed, - Err(_) => chain::ChannelMonitorUpdateStatus::InProgress + Err(_) => chain::ChannelMonitorUpdateStatus::UnrecoverableError } } }
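One final illustration, not part of the patches themselves: because the panic introduced here deliberately poisons the `ChainMonitor` locks, any later channel operation will also panic rather than run without durable state. An application that prefers a fast, supervised restart over attempting further channel operations might install a panic hook along these lines (an application-level sketch; the hook name is hypothetical and an external supervisor is assumed to restart the node from its persisted state):

    use std::panic;
    use std::process;

    // Install once at startup, before the ChainMonitor is constructed.
    fn install_abort_on_panic_hook() {
        let default_hook = panic::take_hook();
        panic::set_hook(Box::new(move |info| {
            // Let the default hook print the panic message (and backtrace,
            // if enabled) first.
            default_hook(info);
            // Locks may already be poisoned; do not attempt a graceful
            // channel shutdown here. Exit and let the supervisor restart.
            process::abort();
        }));
    }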