Skip to content

Commit 45a544d

Browse files
authored
feat: Document and clarify vector graceful shutdown (#681)
* WIP * WIP * WIP * fixup * switch back to ignoring sigterm * Add COMMON_BASH_TRAP_FUNCTIONS * fix: test * Move docs on vector shutdown to Rust comment instead of bash comment * document COMMON_BASH_TRAP_FUNCTIONS * fix test * Add main container to vector container rustdoc * changelog * typo
1 parent 0ac98f0 commit 45a544d

File tree

3 files changed

+126
-13
lines changed

3 files changed

+126
-13
lines changed

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
44

55
## [Unreleased]
66

7+
### Added
8+
9+
- Added `COMMON_BASH_TRAP_FUNCTIONS`, which can be used to write a Vector shutdown trigger file after the main application stopped ([#681]).
10+
11+
### Changed
12+
13+
- BREAKING: Rename `product_logging::framework::shutdown_vector_command` to `create_vector_shutdown_file_command` and added `remove_vector_shutdown_file_command` ([#681]).
14+
15+
[#681]: https://github.com/stackabletech/operator-rs/pull/681
16+
717
## [0.55.0] - 2023-10-16
818

919
### Added

src/product_logging/framework.rs

Lines changed: 71 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1007,16 +1007,30 @@ sinks:
10071007
)
10081008
}
10091009

1010-
/// Create the specification of the Vector log agent container
1010+
/// Create the specification of the Vector log agent container.
1011+
///
1012+
/// The vector process is not running as PID 1, so a Kubernetes SIGTERM will be have no effect.
1013+
/// Instead, the vector process can be shut down by creating a file below {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR},
1014+
/// e.g. {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}/{SHUTDOWN_FILE}. This way logs from the products will always be shipped,
1015+
/// as the vector container will be the last one to terminate. A specific container must be chosen, which has the responsibility
1016+
/// to create a file after it has properly shut down. It should be the one taking the longest to shut down.
1017+
/// E.g. for hdfs the lifetime of vector will be bound to the datanode container and not to the zkfc container.
1018+
/// We *could* have different shutdown trigger files for all application containers and wait for all containers
1019+
/// to terminate, but that seems rather complicated and will be added once needed. Additionally, you should remove
1020+
/// the shutdown marker file on startup of the application, as the application container can crash for any reason
1021+
/// and get restarted. If you don't remove the shutdown file on startup, the vector container will crashloop forever,
1022+
/// as it will start and shut down immediately after!
10111023
///
10121024
/// ```
10131025
/// use stackable_operator::{
10141026
/// builder::{
1027+
/// ContainerBuilder,
10151028
/// meta::ObjectMetaBuilder,
10161029
/// PodBuilder,
10171030
/// resources::ResourceRequirementsBuilder
10181031
/// },
1019-
/// product_logging,
1032+
/// product_logging::{self, framework:: {create_vector_shutdown_file_command, remove_vector_shutdown_file_command}},
1033+
/// utils::COMMON_BASH_TRAP_FUNCTIONS,
10201034
/// };
10211035
/// use k8s_openapi::apimachinery::pkg::api::resource::Quantity;
10221036
/// # use stackable_operator::{
@@ -1026,6 +1040,8 @@ sinks:
10261040
/// # };
10271041
/// # use strum::{Display, EnumIter};
10281042
/// #
1043+
/// # pub const STACKABLE_LOG_DIR: &str = "/stackable/log";
1044+
/// #
10291045
/// # #[derive(Clone, Display, Eq, EnumIter, Ord, PartialEq, PartialOrd)]
10301046
/// # pub enum Container {
10311047
/// # Vector,
@@ -1051,6 +1067,27 @@ sinks:
10511067
/// .with_memory_limit("1Gi")
10521068
/// .build();
10531069
///
1070+
/// pod_builder.add_container(
1071+
/// ContainerBuilder::new("application")
1072+
/// .unwrap()
1073+
/// .image_from_product_image(&resolved_product_image)
1074+
/// .args(vec![format!(
1075+
/// "\
1076+
/// {COMMON_BASH_TRAP_FUNCTIONS}
1077+
/// {remove_vector_shutdown_file_command}
1078+
/// prepare_signal_handlers
1079+
/// my-application start &
1080+
/// wait_for_termination $!
1081+
/// {create_vector_shutdown_file_command}
1082+
/// ",
1083+
/// remove_vector_shutdown_file_command =
1084+
/// remove_vector_shutdown_file_command(STACKABLE_LOG_DIR),
1085+
/// create_vector_shutdown_file_command =
1086+
/// create_vector_shutdown_file_command(STACKABLE_LOG_DIR),
1087+
/// )])
1088+
/// .build(),
1089+
/// );
1090+
///
10541091
/// if logging.enable_vector_agent {
10551092
/// pod_builder.add_container(product_logging::framework::vector_container(
10561093
/// &resolved_product_image,
@@ -1082,15 +1119,28 @@ pub fn vector_container(
10821119
ContainerBuilder::new("vector")
10831120
.unwrap()
10841121
.image_from_product_image(image)
1085-
.command(vec!["bash".into(), "-c".into()])
1122+
.command(vec![
1123+
"/bin/bash".to_string(),
1124+
"-x".to_string(),
1125+
"-euo".to_string(),
1126+
"pipefail".to_string(),
1127+
"-c".to_string(),
1128+
])
1129+
// The following code is an alternative approach which can get SIGTERM terminated as well as via writing a file.
1130+
// It is left in here, as it needed some effort to get it right and can be helpful in the future.
1131+
// bash -c 'sleep 1 && if [ ! -f \"{STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}/{SHUTDOWN_FILE}\" ]; then mkdir -p {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR} && inotifywait -qq --event create {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}; fi && kill 1' &
1132+
// exec vector --config {STACKABLE_CONFIG_DIR}/{VECTOR_CONFIG_FILE}
10861133
.args(vec![format!(
10871134
"\
1088-
vector --config {STACKABLE_CONFIG_DIR}/{VECTOR_CONFIG_FILE} & vector_pid=$! && \
1089-
if [ ! -f \"{STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}/{SHUTDOWN_FILE}\" ]; then \
1090-
mkdir -p {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR} && \
1091-
inotifywait -qq --event create {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}; \
1092-
fi && \
1093-
kill $vector_pid"
1135+
# Vector will ignore SIGTERM (as PID != 1) and must be shut down by writing a shutdown trigger file
1136+
vector --config {STACKABLE_CONFIG_DIR}/{VECTOR_CONFIG_FILE} & vector_pid=$!
1137+
if [ ! -f \"{STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}/{SHUTDOWN_FILE}\" ]; then
1138+
mkdir -p {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR} && \
1139+
inotifywait -qq --event create {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}; \
1140+
fi
1141+
sleep 1
1142+
kill $vector_pid
1143+
"
10941144
)])
10951145
.add_env_var("VECTOR_LOG", log_level.to_vector_literal())
10961146
.add_volume_mount(config_volume_name, STACKABLE_CONFIG_DIR)
@@ -1099,7 +1149,8 @@ kill $vector_pid"
10991149
.build()
11001150
}
11011151

1102-
/// Command to shut down the Vector instance
1152+
/// Command to create a shutdown file for the vector container.
1153+
/// Please delete it before starting your application using [`remove_vector_shutdown_file_command`].
11031154
///
11041155
/// # Example
11051156
///
@@ -1112,8 +1163,9 @@ kill $vector_pid"
11121163
/// const STACKABLE_LOG_DIR: &str = "/stackable/log";
11131164
///
11141165
/// let args = vec![
1115-
/// "echo Perform initialization tasks ...".into(),
1116-
/// product_logging::framework::shutdown_vector_command(STACKABLE_LOG_DIR),
1166+
/// product_logging::framework::remove_vector_shutdown_file_command(STACKABLE_LOG_DIR),
1167+
/// "echo Perform some tasks ...".into(),
1168+
/// product_logging::framework::create_vector_shutdown_file_command(STACKABLE_LOG_DIR),
11171169
/// ];
11181170
///
11191171
/// let container = ContainerBuilder::new("init")
@@ -1124,13 +1176,19 @@ kill $vector_pid"
11241176
/// .add_volume_mount("log", STACKABLE_LOG_DIR)
11251177
/// .build();
11261178
/// ```
1127-
pub fn shutdown_vector_command(stackable_log_dir: &str) -> String {
1179+
pub fn create_vector_shutdown_file_command(stackable_log_dir: &str) -> String {
11281180
format!(
11291181
"mkdir -p {stackable_log_dir}/{VECTOR_LOG_DIR} && \
11301182
touch {stackable_log_dir}/{VECTOR_LOG_DIR}/{SHUTDOWN_FILE}"
11311183
)
11321184
}
11331185

1186+
/// Use this command to remove the shutdown file (if it exists) created by [`create_vector_shutdown_file_command`].
1187+
/// You should execute this command before starting your application.
1188+
pub fn remove_vector_shutdown_file_command(stackable_log_dir: &str) -> String {
1189+
format!("rm -f {stackable_log_dir}/{VECTOR_LOG_DIR}/{SHUTDOWN_FILE}")
1190+
}
1191+
11341192
#[cfg(test)]
11351193
mod tests {
11361194
use super::*;

src/utils.rs

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,50 @@
11
use tracing::info;
22

3+
/// This is a bash snippet, which adds two functions out of interest:
4+
///
5+
/// 1. `prepare_signal_handlers` call this first to set up the needed traps
6+
/// 2. `wait_for_termination` waits for the PID you passed as the first argument to terminate
7+
///
8+
/// An example use could be
9+
/// ```text
10+
/// {COMMON_BASH_TRAP_FUNCTIONS}
11+
/// echo "Run before startup"
12+
/// prepare_signal_handlers
13+
/// {hadoop_home}/bin/hdfs {role} &
14+
/// wait_for_termination $!
15+
/// echo "Run after termination"
16+
/// ```
17+
pub const COMMON_BASH_TRAP_FUNCTIONS: &str = r#"
18+
prepare_signal_handlers()
19+
{
20+
unset term_child_pid
21+
unset term_kill_needed
22+
trap 'handle_term_signal' TERM
23+
}
24+
25+
handle_term_signal()
26+
{
27+
if [ "${term_child_pid}" ]; then
28+
kill -TERM "${term_child_pid}" 2>/dev/null
29+
else
30+
term_kill_needed="yes"
31+
fi
32+
}
33+
34+
wait_for_termination()
35+
{
36+
set +e
37+
term_child_pid=$1
38+
if [[ -v term_kill_needed ]]; then
39+
kill -TERM "${term_child_pid}" 2>/dev/null
40+
fi
41+
wait ${term_child_pid} 2>/dev/null
42+
trap - TERM
43+
wait ${term_child_pid} 2>/dev/null
44+
set -e
45+
}
46+
"#;
47+
348
/// Prints helpful and standardized diagnostic messages.
449
///
550
/// This method is meant to be called first thing in the `main` method of an Operator.

0 commit comments

Comments
 (0)