diff --git a/CHANGELOG.md b/CHANGELOG.md index 238ce2084..b390b38af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file. ## [Unreleased] +### Added + +- Added `COMMON_BASH_TRAP_FUNCTIONS`, which can be used to write a Vector shutdown trigger file after the main application has stopped ([#681]). + +### Changed + +- BREAKING: Rename `product_logging::framework::shutdown_vector_command` to `create_vector_shutdown_file_command` and add `remove_vector_shutdown_file_command` ([#681]). + +[#681]: https://github.com/stackabletech/operator-rs/pull/681 + ## [0.55.0] - 2023-10-16 ### Added diff --git a/src/product_logging/framework.rs b/src/product_logging/framework.rs index d7ae6db0c..b3ba66d7c 100644 --- a/src/product_logging/framework.rs +++ b/src/product_logging/framework.rs @@ -1007,16 +1007,30 @@ sinks: ) } -/// Create the specification of the Vector log agent container +/// Create the specification of the Vector log agent container. +/// +/// The vector process is not running as PID 1, so a Kubernetes SIGTERM will have no effect. +/// Instead, the vector process can be shut down by creating a file below {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}, +/// e.g. {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}/{SHUTDOWN_FILE}. This way logs from the products will always be shipped, +/// as the vector container will be the last one to terminate. A specific container must be chosen, which has the responsibility +/// to create a file after it has properly shut down. It should be the one taking the longest to shut down. +/// E.g. for hdfs the lifetime of vector will be bound to the datanode container and not to the zkfc container. +/// We *could* have different shutdown trigger files for all application containers and wait for all containers +/// to terminate, but that seems rather complicated and will be added once needed. 
Additionally, you should remove +/// the shutdown marker file on startup of the application, as the application container can crash for any reason +/// and get restarted. If you don't remove the shutdown file on startup, the vector container will crashloop forever, +/// as it will start and shut down immediately after! /// /// ``` /// use stackable_operator::{ /// builder::{ +/// ContainerBuilder, /// meta::ObjectMetaBuilder, /// PodBuilder, /// resources::ResourceRequirementsBuilder /// }, -/// product_logging, +/// product_logging::{self, framework:: {create_vector_shutdown_file_command, remove_vector_shutdown_file_command}}, +/// utils::COMMON_BASH_TRAP_FUNCTIONS, /// }; /// use k8s_openapi::apimachinery::pkg::api::resource::Quantity; /// # use stackable_operator::{ @@ -1026,6 +1040,8 @@ sinks: /// # }; /// # use strum::{Display, EnumIter}; /// # +/// # pub const STACKABLE_LOG_DIR: &str = "/stackable/log"; +/// # /// # #[derive(Clone, Display, Eq, EnumIter, Ord, PartialEq, PartialOrd)] /// # pub enum Container { /// # Vector, @@ -1051,6 +1067,27 @@ sinks: /// .with_memory_limit("1Gi") /// .build(); /// +/// pod_builder.add_container( +/// ContainerBuilder::new("application") +/// .unwrap() +/// .image_from_product_image(&resolved_product_image) +/// .args(vec![format!( +/// "\ +/// {COMMON_BASH_TRAP_FUNCTIONS} +/// {remove_vector_shutdown_file_command} +/// prepare_signal_handlers +/// my-application start & +/// wait_for_termination $! 
+/// {create_vector_shutdown_file_command} +/// ", +/// remove_vector_shutdown_file_command = +/// remove_vector_shutdown_file_command(STACKABLE_LOG_DIR), +/// create_vector_shutdown_file_command = +/// create_vector_shutdown_file_command(STACKABLE_LOG_DIR), +/// )]) +/// .build(), +/// ); +/// /// if logging.enable_vector_agent { /// pod_builder.add_container(product_logging::framework::vector_container( /// &resolved_product_image, @@ -1082,15 +1119,28 @@ pub fn vector_container( ContainerBuilder::new("vector") .unwrap() .image_from_product_image(image) - .command(vec!["bash".into(), "-c".into()]) + .command(vec![ + "/bin/bash".to_string(), + "-x".to_string(), + "-euo".to_string(), + "pipefail".to_string(), + "-c".to_string(), + ]) + // The following code is an alternative approach which can be terminated via SIGTERM as well as by writing a file. + // It is left in here, as it took some effort to get right and may be helpful in the future. + // bash -c 'sleep 1 && if [ ! -f \"{STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}/{SHUTDOWN_FILE}\" ]; then mkdir -p {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR} && inotifywait -qq --event create {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}; fi && kill 1' & + // exec vector --config {STACKABLE_CONFIG_DIR}/{VECTOR_CONFIG_FILE} .args(vec![format!( "\ -vector --config {STACKABLE_CONFIG_DIR}/{VECTOR_CONFIG_FILE} & vector_pid=$! && \ -if [ ! -f \"{STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}/{SHUTDOWN_FILE}\" ]; then \ -mkdir -p {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR} && \ -inotifywait -qq --event create {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}; \ -fi && \ -kill $vector_pid" +# Vector will ignore SIGTERM (as PID != 1) and must be shut down by writing a shutdown trigger file +vector --config {STACKABLE_CONFIG_DIR}/{VECTOR_CONFIG_FILE} & vector_pid=$! +if [ ! 
-f \"{STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}/{SHUTDOWN_FILE}\" ]; then + mkdir -p {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR} && \ + inotifywait -qq --event create {STACKABLE_LOG_DIR}/{VECTOR_LOG_DIR}; \ +fi +sleep 1 +kill $vector_pid +" )]) .add_env_var("VECTOR_LOG", log_level.to_vector_literal()) .add_volume_mount(config_volume_name, STACKABLE_CONFIG_DIR) @@ -1099,7 +1149,8 @@ kill $vector_pid" .build() } -/// Command to shut down the Vector instance +/// Command to create a shutdown file for the vector container. +/// Please delete it before starting your application using [`remove_vector_shutdown_file_command`]. /// /// # Example /// @@ -1112,8 +1163,9 @@ kill $vector_pid" /// const STACKABLE_LOG_DIR: &str = "/stackable/log"; /// /// let args = vec![ -/// "echo Perform initialization tasks ...".into(), -/// product_logging::framework::shutdown_vector_command(STACKABLE_LOG_DIR), +/// product_logging::framework::remove_vector_shutdown_file_command(STACKABLE_LOG_DIR), +/// "echo Perform some tasks ...".into(), +/// product_logging::framework::create_vector_shutdown_file_command(STACKABLE_LOG_DIR), /// ]; /// /// let container = ContainerBuilder::new("init") @@ -1124,13 +1176,19 @@ kill $vector_pid" /// .add_volume_mount("log", STACKABLE_LOG_DIR) /// .build(); /// ``` -pub fn shutdown_vector_command(stackable_log_dir: &str) -> String { +pub fn create_vector_shutdown_file_command(stackable_log_dir: &str) -> String { format!( "mkdir -p {stackable_log_dir}/{VECTOR_LOG_DIR} && \ touch {stackable_log_dir}/{VECTOR_LOG_DIR}/{SHUTDOWN_FILE}" ) } +/// Use this command to remove the shutdown file (if it exists) created by [`create_vector_shutdown_file_command`]. +/// You should execute this command before starting your application. 
+pub fn remove_vector_shutdown_file_command(stackable_log_dir: &str) -> String { + format!("rm -f {stackable_log_dir}/{VECTOR_LOG_DIR}/{SHUTDOWN_FILE}") +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/utils.rs b/src/utils.rs index a41a61f2b..9d041bcea 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,5 +1,50 @@ use tracing::info; +/// This is a bash snippet, which provides two functions of interest: +/// +/// 1. `prepare_signal_handlers`: call this first to set up the needed traps +/// 2. `wait_for_termination`: waits for the PID you passed as the first argument to terminate +/// +/// An example use could be: +/// ```text +/// {COMMON_BASH_TRAP_FUNCTIONS} +/// echo "Run before startup" +/// prepare_signal_handlers +/// {hadoop_home}/bin/hdfs {role} & +/// wait_for_termination $! +/// echo "Run after termination" +/// ``` +pub const COMMON_BASH_TRAP_FUNCTIONS: &str = r#" +prepare_signal_handlers() +{ + unset term_child_pid + unset term_kill_needed + trap 'handle_term_signal' TERM +} + +handle_term_signal() +{ + if [ "${term_child_pid}" ]; then + kill -TERM "${term_child_pid}" 2>/dev/null + else + term_kill_needed="yes" + fi +} + +wait_for_termination() +{ + set +e + term_child_pid=$1 + if [[ -v term_kill_needed ]]; then + kill -TERM "${term_child_pid}" 2>/dev/null + fi + wait ${term_child_pid} 2>/dev/null + trap - TERM + wait ${term_child_pid} 2>/dev/null + set -e +} +"#; + /// Prints helpful and standardized diagnostic messages. /// /// This method is meant to be called first thing in the `main` method of an Operator.