Skip to content

feat: Document and clarify vector graceful shutdown #681

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Oct 31, 2023
Merged
55 changes: 44 additions & 11 deletions src/product_logging/framework.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1082,15 +1082,40 @@ pub fn vector_container(
ContainerBuilder::new("vector")
.unwrap()
.image_from_product_image(image)
.command(vec!["bash".into(), "-c".into()])
.command(vec![
"/bin/bash".to_string(),
"-x".to_string(),
"-euo".to_string(),
"pipefail".to_string(),
"-c".to_string(),
])
.args(vec![format!(
"\
vector --config {STACKABLE_CONFIG_DIR}/{VECTOR_CONFIG_FILE} & vector_pid=$! && \
if [ ! -f \"{STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}/{SHUTDOWN_FILE}\" ]; then \
mkdir -p {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR} && \
inotifywait -qq --event create {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}; \
fi && \
kill $vector_pid"
# The vector process is not running as PID 1, so a Kubernetes SIGTERM will be have no effect.
# Instead, the vector process can be shut down by creating a file below {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR},
# e.g. {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}/{SHUTDOWN_FILE}.
# This way logs from the products will always be shipped, as the vector container will be the last one to terminate.
# A specific container must be chosen, which has the responsibility to create a file after it has
# properly shut down. It should be the one taking the longest to shut down.
# E.g. for hdfs the lifetime of vector will be bound to the datanode container and not to the zkfc container.
# We *could* have different shutdown trigger files for all application containers and wait for all containers
# to terminate, but that seems rather complicated and will be added once needed.
# Additionally, you should remove the shutdown marker file on startup of the application, as the application
# container can crash for any reason and get restarted. If you don't remove the shutdown file on startup,
# the vector container will crashloop forever as it will start and shut down immediately after!

# ALTERNATIVE, which can get SIGTERM terminated as well as via writing a file (?)
# bash -c 'sleep 1 && if [ ! -f \"{STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}/{SHUTDOWN_FILE}\" ]; then mkdir -p {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR} && inotifywait -qq --event create {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}; fi && kill 1' &
# exec vector --config {STACKABLE_CONFIG_DIR}/{VECTOR_CONFIG_FILE}

vector --config {STACKABLE_CONFIG_DIR}/{VECTOR_CONFIG_FILE} & vector_pid=$!
if [ ! -f \"{STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}/{SHUTDOWN_FILE}\" ]; then
mkdir -p {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR} && \
inotifywait -qq --event create {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}; \
fi
sleep 1
kill $vector_pid
"
)])
.add_env_var("VECTOR_LOG", log_level.to_vector_literal())
.add_volume_mount(config_volume_name, STACKABLE_CONFIG_DIR)
Expand All @@ -1099,7 +1124,8 @@ kill $vector_pid"
.build()
}

/// Command to shut down the Vector instance
/// Command to create a shutdown file for the vector container.
/// Please delete it before starting your application using [`remove_vector_shutdown_file_command`].
///
/// # Example
///
Expand All @@ -1112,8 +1138,9 @@ kill $vector_pid"
/// const STACKABLE_LOG_DIR: &str = "/stackable/log";
///
/// let args = vec![
/// "echo Perform initialization tasks ...".into(),
/// product_logging::framework::shutdown_vector_command(STACKABLE_LOG_DIR),
/// product_logging::framework::remove_vector_shutdown_file_command(STACKABLE_LOG_DIR),
/// "echo Perform some tasks ...".into(),
/// product_logging::framework::create_vector_shutdown_file_command(STACKABLE_LOG_DIR),
/// ];
///
/// let container = ContainerBuilder::new("init")
Expand All @@ -1124,13 +1151,19 @@ kill $vector_pid"
/// .add_volume_mount("log", STACKABLE_LOG_DIR)
/// .build();
/// ```
pub fn shutdown_vector_command(stackable_log_dir: &str) -> String {
pub fn create_vector_shutdown_file_command(stackable_log_dir: &str) -> String {
    // Directory that holds the shutdown marker; it is both created and
    // used as the parent of the marker file, so build the path once.
    let vector_log_dir = format!("{stackable_log_dir}/{VECTOR_LOG_DIR}");

    // `mkdir -p` makes sure the directory exists before `touch` creates the marker in it.
    format!("mkdir -p {vector_log_dir} && touch {vector_log_dir}/{SHUTDOWN_FILE}")
}

/// Returns a shell command that deletes the Vector shutdown marker file.
///
/// The marker is the file written by [`create_vector_shutdown_file_command`]. The generated
/// command uses `rm -f`, so it also succeeds when the file does not exist. Run it before
/// starting your application.
pub fn remove_vector_shutdown_file_command(stackable_log_dir: &str) -> String {
    let shutdown_file = format!("{stackable_log_dir}/{VECTOR_LOG_DIR}/{SHUTDOWN_FILE}");
    format!("rm -f {shutdown_file}")
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
31 changes: 31 additions & 0 deletions src/utils.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,36 @@
use tracing::info;

/// Bash helper functions for passing a `TERM` signal received by a wrapper shell on to a
/// child process.
///
/// The snippet defines three shell functions:
///
/// * `prepare_signal_handlers` unsets `term_child_pid` and `term_kill_needed` and installs
///   `handle_term` as the trap for `TERM`.
/// * `handle_term` sends `TERM` to `term_child_pid` if that variable is non-empty; otherwise it
///   sets `term_kill_needed` so the signal can be delivered once the child PID is known.
/// * `wait_for_termination` stores `$!` in `term_child_pid`, sends the deferred `TERM` if
///   `term_kill_needed` is set, waits for the child, removes the `TERM` trap and waits for the
///   child a second time. Its body runs between `set +e` and `set -e`.
///
/// NOTE(review): presumably meant to be used as `prepare_signal_handlers`, then starting the
/// application in the background, then `wait_for_termination` — confirm against callers. The
/// function reads `$!` itself and does not look at its arguments.
///
/// NOTE(review): the second `wait` is presumably needed because the first one returns early
/// when the trap fires — confirm against the bash documentation for `wait` and `trap`.
///
/// NOTE(review): `wait_for_termination` ends with `set -e` unconditionally, and `handle_term`
/// expands `${term_child_pid}` after it was unset; check how this interacts with callers that
/// do not use `errexit` or that enable `nounset`.
pub const COMMON_BASH_TRAP_FUNCTIONS: &str = r#"
prepare_signal_handlers()
{
unset term_child_pid
unset term_kill_needed
trap 'handle_term' TERM
}

handle_term()
{
if [ "${term_child_pid}" ]; then
kill -TERM "${term_child_pid}" 2>/dev/null
else
term_kill_needed="yes"
fi
}

wait_for_termination()
{
set +e
term_child_pid=$!
if [[ -v term_kill_needed ]]; then
kill -TERM "${term_child_pid}" 2>/dev/null
fi
wait ${term_child_pid} 2>/dev/null
trap - TERM
wait ${term_child_pid} 2>/dev/null
set -e
}
"#;

/// Prints helpful and standardized diagnostic messages.
///
/// This method is meant to be called first thing in the `main` method of an Operator.
Expand Down