From 12be3a64b7b2309c652d0d0162e9e791563ea76d Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Fri, 15 Sep 2023 16:43:39 +0200 Subject: [PATCH 01/17] Add ADR on Pod disruptions --- .../adr/ADR030-reduce-pod-disruptions.adoc | 116 ++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc diff --git a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc new file mode 100644 index 000000000..fd0289f7f --- /dev/null +++ b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc @@ -0,0 +1,116 @@ += ADR032: Reduce Pod disruptions +Razvan Mihai +v0.1, 2023-09-15 +:status: accepted + +* Status: {status} +* Deciders: +** Sebastian Bernauer +* Date: 2023-09-15 + +== Context and Problem Statement + +Downtime of products is always bad. +Kubernetes has a multiple concepts to try to reduce this to a minimum. + +== Considered Options + +1. Use PodDisruptionBudget + +=== 1. Use PodDisruptionBudget + +1. We must deploy a https://kubernetes.io/docs/tasks/run-application/configure-pdb/[PodDisruptionBudgets] alongside all the StatefulSets (and Deployments in the future) to restrict pod disruptions. +2. Also users need to ability to override the numbers we default to, as they need to make a tradeoff between availability and rollout times e.g. in rolling redeployment. Context: I have operated Trino clusters that could take more than 6 hours to rolling redeploy, as the graceful shutdown of Trino workers takes a considerable amount of time - depended on the queries getting executed. + +We have the following constraints: + +1. If we use https://kubernetes.io/docs/tasks/run-application/configure-pdb/#arbitrary-controllers-and-selectors[arbitrary workloads and arbitrary selectors} we have the following constraints: + * only `.spec.minAvailable` can be used, not `.spec.maxUnavailable`. + * only an integer value can be used with `.spec.minAvailable`, not a percentage. +2. You can use a selector which selects a subset or superset of the pods belonging to a workload resource. The eviction API will disallow eviction of any pod covered by multiple PDBs, so most users will want to avoid overlapping selectors + +Because of the mentioned constraints we have the following implications: + +1. Use `.spec.maxUnavailable` everywhere +2. Have `.spec.maxUnavailable` configurable on the product CRD. +3. Create PodDisruptionBudget over the role and not over the rolegroups, as e.g. the Zookeeper quorum does not care about rolegroups. As of the docs we can not add a PDB for the role and the rolegroup at the same time. +4. Users must be able to disable our PDB creation in the case they want to define their own, as otherwise the Pods would have multiple PDBs, which is not supported. +5. We try to have a PDB per role, as this makes things much easier than e.g. saying "out of the namenodes and journalnodes only one can be down". Otherwise we can not make it "simply" configurable on the role. 
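As an illustration of the first constraint and of implication 4: if a user disables our PDB creation and instead covers several roles with a single PDB of their own, the selector is an arbitrary one, so only an integer `.spec.minAvailable` can be used. The manifest below is only a sketch with illustrative names and values, not something our operators would create:

[source,yaml]
----
# Hypothetical user-managed PDB spanning two HDFS roles at once.
# Because the selector is "arbitrary" (it does not match exactly one
# StatefulSet), Kubernetes only supports an integer minAvailable here.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: simple-hdfs-name-and-journalnodes # illustrative name
spec:
  minAvailable: 4 # must be an integer, percentages are not supported for arbitrary selectors
  selector:
    matchLabels:
      app.kubernetes.io/name: hdfs
      app.kubernetes.io/instance: simple-hdfs
    matchExpressions:
      - key: app.kubernetes.io/component
        operator: In
        values:
          - namenode
          - journalnode
----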
+ +Taking the implications into account we end up with the following CRD structure: + +[source,yaml] +---- +apiVersion: hdfs.stackable.tech/v1alpha1 +kind: HdfsCluster +metadata: + name: simple-hdfs +spec: + image: + productVersion: 3.3.4 + clusterConfig: + zookeeperConfigMapName: simple-hdfs-znode + nameNodes: + # optional, only supported on role, *not* on rolegroup + pdb: + enabled: true # optional, defaults to true + maxUnavailable: 1 # optional, defaults to our "smart" calculation + roleGroups: + default: + replicas: 2 + dataNodes: + pdb: + enabled: true + maxUnavailable: 1 + roleGroups: + default: + replicas: 1 + journalNodes: + pdb: + enabled: true + maxUnavailable: 1 + roleGroups: + default: + replicas: 10 +---- + +and end up with the following PDBs when the default values are used: + +[source,yaml] +---- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: simple-hdfs-journalnodes +spec: + maxUnavailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: hdfs + app.kubernetes.io/instance: simple-hdfs + app.kubernetes.io/component: journalnode +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: simple-hdfs-namenodes +spec: + maxUnavailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: hdfs + app.kubernetes.io/instance: simple-hdfs + app.kubernetes.io/component: namenode +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: simple-hdfs-datanodes +spec: + maxUnavailable: 2 + selector: + matchLabels: + app.kubernetes.io/name: hdfs + app.kubernetes.io/instance: simple-hdfs + app.kubernetes.io/component: datanode +---- From 88653267af49be0731228d9cd2859888ab127b80 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Fri, 15 Sep 2023 16:47:26 +0200 Subject: [PATCH 02/17] WIP --- .../adr/ADR030-reduce-pod-disruptions.adoc | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc index fd0289f7f..2eecb78b2 100644 --- a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc +++ b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc @@ -11,15 +11,11 @@ v0.1, 2023-09-15 == Context and Problem Statement Downtime of products is always bad. -Kubernetes has a multiple concepts to try to reduce this to a minimum. +Kubernetes has a a concepts called https://kubernetes.io/docs/tasks/run-application/configure-pdb/[PodDisruptionBudget] (PDB) to try to reduce this to an absolute minimum. -== Considered Options +*Requirements:* -1. Use PodDisruptionBudget - -=== 1. Use PodDisruptionBudget - -1. We must deploy a https://kubernetes.io/docs/tasks/run-application/configure-pdb/[PodDisruptionBudgets] alongside all the StatefulSets (and Deployments in the future) to restrict pod disruptions. +1. We must deploy a PDB alongside all the StatefulSets (and Deployments in the future) to restrict pod disruptions. 2. Also users need to ability to override the numbers we default to, as they need to make a tradeoff between availability and rollout times e.g. in rolling redeployment. Context: I have operated Trino clusters that could take more than 6 hours to rolling redeploy, as the graceful shutdown of Trino workers takes a considerable amount of time - depended on the queries getting executed. 
We have the following constraints: @@ -59,16 +55,12 @@ spec: default: replicas: 2 dataNodes: - pdb: - enabled: true - maxUnavailable: 1 + # use pdb defaults roleGroups: default: replicas: 1 journalNodes: - pdb: - enabled: true - maxUnavailable: 1 + # use pdb defaults roleGroups: default: replicas: 10 From e0b60429c316d4ab5ab84b7fbf79f35709304567 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Fri, 15 Sep 2023 16:48:20 +0200 Subject: [PATCH 03/17] docs --- .../contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc index 2eecb78b2..2fdb36200 100644 --- a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc +++ b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc @@ -1,5 +1,5 @@ = ADR032: Reduce Pod disruptions -Razvan Mihai +Sebastian Bernauer v0.1, 2023-09-15 :status: accepted From d54625648a5482973ac2a6ffe952b632f67f4ce0 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Fri, 15 Sep 2023 16:53:14 +0200 Subject: [PATCH 04/17] WIP --- .../contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc index 2fdb36200..ad5c9e693 100644 --- a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc +++ b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc @@ -1,4 +1,4 @@ -= ADR032: Reduce Pod disruptions += ADR030: Reduce Pod disruptions Sebastian Bernauer v0.1, 2023-09-15 :status: accepted From a20dfa16751b1355f5eba9a9b7856fbae82d76ce Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Fri, 15 Sep 2023 16:53:54 +0200 Subject: [PATCH 05/17] WIP --- .../contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc index ad5c9e693..6b96bc267 100644 --- a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc +++ b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc @@ -15,7 +15,7 @@ Kubernetes has a a concepts called https://kubernetes.io/docs/tasks/run-applicat *Requirements:* -1. We must deploy a PDB alongside all the StatefulSets (and Deployments in the future) to restrict pod disruptions. +1. We must deploy a PDB alongside all the product StatefulSets (and Deployments in the future) to restrict pod disruptions. 2. Also users need to ability to override the numbers we default to, as they need to make a tradeoff between availability and rollout times e.g. in rolling redeployment. Context: I have operated Trino clusters that could take more than 6 hours to rolling redeploy, as the graceful shutdown of Trino workers takes a considerable amount of time - depended on the queries getting executed. 
We have the following constraints: From 01d49ee4724d63dae51c79b7bbd5ebd8fdc81f81 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Fri, 15 Sep 2023 16:54:18 +0200 Subject: [PATCH 06/17] typo --- .../contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc index 6b96bc267..c8cdd294c 100644 --- a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc +++ b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc @@ -20,7 +20,7 @@ Kubernetes has a a concepts called https://kubernetes.io/docs/tasks/run-applicat We have the following constraints: -1. If we use https://kubernetes.io/docs/tasks/run-application/configure-pdb/#arbitrary-controllers-and-selectors[arbitrary workloads and arbitrary selectors} we have the following constraints: +1. If we use https://kubernetes.io/docs/tasks/run-application/configure-pdb/#arbitrary-controllers-and-selectors[arbitrary workloads and arbitrary selectors] we have the following constraints: * only `.spec.minAvailable` can be used, not `.spec.maxUnavailable`. * only an integer value can be used with `.spec.minAvailable`, not a percentage. 2. You can use a selector which selects a subset or superset of the pods belonging to a workload resource. The eviction API will disallow eviction of any pod covered by multiple PDBs, so most users will want to avoid overlapping selectors From cb08ab3f75f6a824f5b3425ea57b0eb803d40fcb Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Fri, 15 Sep 2023 16:57:34 +0200 Subject: [PATCH 07/17] todo --- .../contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc index c8cdd294c..aa691aef8 100644 --- a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc +++ b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc @@ -8,6 +8,8 @@ v0.1, 2023-09-15 ** Sebastian Bernauer * Date: 2023-09-15 +TODO (sbernauer): Document setting minUnavailable to 0 and doing node drainage manually . + == Context and Problem Statement Downtime of products is always bad. From 2efcd8cfa16ca465a90ee9c529e60127a1afae9c Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Tue, 19 Sep 2023 12:46:26 +0200 Subject: [PATCH 08/17] Update ADR030-reduce-pod-disruptions.adoc --- .../contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc index aa691aef8..c8cdd294c 100644 --- a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc +++ b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc @@ -8,8 +8,6 @@ v0.1, 2023-09-15 ** Sebastian Bernauer * Date: 2023-09-15 -TODO (sbernauer): Document setting minUnavailable to 0 and doing node drainage manually . - == Context and Problem Statement Downtime of products is always bad. 
From 4c0e74e0704f4018d8c548dc65b00598462136ca Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Tue, 19 Sep 2023 13:42:45 +0200 Subject: [PATCH 09/17] fixes --- .../contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc index c8cdd294c..eb5c9655e 100644 --- a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc +++ b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc @@ -11,12 +11,13 @@ v0.1, 2023-09-15 == Context and Problem Statement Downtime of products is always bad. -Kubernetes has a a concepts called https://kubernetes.io/docs/tasks/run-application/configure-pdb/[PodDisruptionBudget] (PDB) to try to reduce this to an absolute minimum. +Kubernetes has a a concepts called https://kubernetes.io/docs/tasks/run-application/configure-pdb/[PodDisruptionBudget] (PDB) to prevent this. +We want to use this functionary to try to reduce the downtime to an absolute minimum. *Requirements:* 1. We must deploy a PDB alongside all the product StatefulSets (and Deployments in the future) to restrict pod disruptions. -2. Also users need to ability to override the numbers we default to, as they need to make a tradeoff between availability and rollout times e.g. in rolling redeployment. Context: I have operated Trino clusters that could take more than 6 hours to rolling redeploy, as the graceful shutdown of Trino workers takes a considerable amount of time - depended on the queries getting executed. +2. Also users need the ability to override the numbers we default to, as they need to make a tradeoff between availability and rollout times e.g. in rolling redeployment. Context: I have operated Trino clusters that could take more than 6 hours to rolling redeploy, as the graceful shutdown of Trino workers takes a considerable amount of time - depended on the queries getting executed. We have the following constraints: From 713b5eaec095eeec04852317020fed9657475222 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Tue, 19 Sep 2023 13:43:47 +0200 Subject: [PATCH 10/17] Add to nav --- modules/contributor/partials/current_adrs.adoc | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/contributor/partials/current_adrs.adoc b/modules/contributor/partials/current_adrs.adoc index 744ff8550..1e9a73ae0 100644 --- a/modules/contributor/partials/current_adrs.adoc +++ b/modules/contributor/partials/current_adrs.adoc @@ -26,3 +26,4 @@ **** xref:adr/ADR027-status.adoc[] **** xref:adr/ADR028-automatic-stackable-version.adoc[] **** xref:adr/ADR029-database-connection.adoc[] +**** xref:adr/ADR030-reduce-pod-disruptions.adoc[] From 4d5de777b00f367b3927e7f8114f55030678b4a5 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Thu, 21 Sep 2023 10:32:38 +0200 Subject: [PATCH 11/17] WIP --- .../adr/ADR030-reduce-pod-disruptions.adoc | 306 +++++++++++++++++- 1 file changed, 296 insertions(+), 10 deletions(-) diff --git a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc index eb5c9655e..cb915e4b8 100644 --- a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc +++ b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc @@ -14,7 +14,23 @@ Downtime of products is always bad. 
Kubernetes has a a concepts called https://kubernetes.io/docs/tasks/run-application/configure-pdb/[PodDisruptionBudget] (PDB) to prevent this. We want to use this functionary to try to reduce the downtime to an absolute minimum. -*Requirements:* +== Decision Drivers + +* Ease of use and comprehensibility for the user +** Principle of least surprise +* Easy implementation (far less important) +* Extendable design, so that we can later non-breaking add new functionality, such as giving the chance to configure PDBs on roleGroup level as well. + +== Example use-cases + +1. As a user I want an HDFS and it (or parts) should not be disturbed by planned pod evictions. +2. As a user I want to configure maxUnavailable on the role (e.g. datanode) across all rolegroups (e.g. dfs replicas 3 and only a single datanode is allowed to go down - regardless of the number of rolegroups), so that no datanode is a single point of failure. +3. As a user I want to configure maxUnavailable on the rolegroups individually, as I e.g. have some fast datanodes using SSDs and some slow datanodes using HDDs. I want to have always X number of fast datanodes online for performance reasons. +4. As a user I want a Superset/NiFi/Kafka and they (or parts) should not be disturbed by planned pod evictions. + +Most of the users probably either don't know what PDBs are or are fine with the default values our operators deploy based upon our knowledge of the products. + +== Requirements 1. We must deploy a PDB alongside all the product StatefulSets (and Deployments in the future) to restrict pod disruptions. 2. Also users need the ability to override the numbers we default to, as they need to make a tradeoff between availability and rollout times e.g. in rolling redeployment. Context: I have operated Trino clusters that could take more than 6 hours to rolling redeploy, as the graceful shutdown of Trino workers takes a considerable amount of time - depended on the queries getting executed. @@ -34,8 +50,19 @@ Because of the mentioned constraints we have the following implications: 4. Users must be able to disable our PDB creation in the case they want to define their own, as otherwise the Pods would have multiple PDBs, which is not supported. 5. We try to have a PDB per role, as this makes things much easier than e.g. saying "out of the namenodes and journalnodes only one can be down". Otherwise we can not make it "simply" configurable on the role. -Taking the implications into account we end up with the following CRD structure: +== Question 1: Do we want to support configuring PDBs on role or role and rolegroup? + +=== Option 1: Configurable on role level +=== Option 2: Configurable on role + rolegroup level + +Cons: + +* It's really really complicated for the user and the implementation. 
+ +.Explanation +[%collapsible] +==== [source,yaml] ---- apiVersion: hdfs.stackable.tech/v1alpha1 @@ -48,8 +75,80 @@ spec: clusterConfig: zookeeperConfigMapName: simple-hdfs-znode nameNodes: - # optional, only supported on role, *not* on rolegroup - pdb: + config: + podDisruptionBudget: + enabled: true + maxUnavailable: 2 + roleGroups: + hdd: + replicas: 16 + config: + podDisruptionBudget: + maxUnavailable: 4 + ssd: + replicas: 8 + config: + podDisruptionBudget: + enabled: false + in-memory: + replicas: 4 +---- + +would end up with something like + +[source,yaml] +---- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: simple-hdfs-datanodes-hdds +spec: + maxUnavailable: 4 + selector: + matchLabels: + app.kubernetes.io/name: hdfs + app.kubernetes.io/instance: simple-hdfs + app.kubernetes.io/component: datanode + app.kubernetes.io/rolegroup: hdd +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: simple-hdfs-datanodes-not-hdds +spec: + maxUnavailable: 2 + selector: + matchLabels: + app.kubernetes.io/name: hdfs + app.kubernetes.io/instance: simple-hdfs + app.kubernetes.io/component: datanode + matchExpressions: + - key: app.kubernetes.io/rolegroup + operator: NotIn + values: + - hdd + - key: app.kubernetes.io/rolegroup + operator: NotIn + values: + - in-memory +---- +==== + +Chosen option: *Option 1: Configurable on role level* + +== Question 2: How dows the CRD structure look like? + +=== Option 1 + +[source,yaml] +---- +apiVersion: hdfs.stackable.tech/v1alpha1 +kind: HdfsCluster +metadata: + name: simple-hdfs +spec: + nameNodes: + podDisruptionBudget: # optional enabled: true # optional, defaults to true maxUnavailable: 1 # optional, defaults to our "smart" calculation roleGroups: @@ -59,15 +158,202 @@ spec: # use pdb defaults roleGroups: default: - replicas: 1 - journalNodes: - # use pdb defaults + replicas: 2 +---- + +==== Pros + +* Everything below `config` can be merged, everything below `clusterConfig` has applied to the whole cluster (no exceptions) + +==== Cons + +* Bloating `spec.namenodes` + +=== Option 2 + +[source,yaml] +---- +spec: + nameNodes: + config: # <<< + podDisruptionBudget: + enabled: true + maxUnavailable: 1 roleGroups: default: - replicas: 10 + replicas: 2 + config: {} + # no such field as podDisruptionBudget ---- -and end up with the following PDBs when the default values are used: +==== Pros + +* Everything configurable is below `config` - some attributes of it can be merged - or `clusterConfig`. + +==== Cons + +* `spec.nameNodes.config` is *not* similar to `spec.nameNodes.roleGroups.default.config` => Confusing to the user + +=== Option 3 + +[source,yaml] +---- +spec: + nameNodes: + roleConfig: # <<< + podDisruptionBudget: + enabled: true + maxUnavailable: 1 + roleGroups: + default: + replicas: 2 +---- + +==== Pros + +* Not bloating `spec.namenodes` + +==== Cons + +* Yet another "config" (config, clusterConfig and now roleConfig as well) +** That's kind of the way the real world is: There are some thing you can configure on cluster level (e.g. ldap), role level (pdbs) and role group level (resources). This models this the closest. 
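For contrast with the per-rolegroup variant below, a role-level setting would boil down to a single PDB covering all rolegroups of the role, roughly like the following sketch. The label values follow the earlier examples in this ADR, the `maxUnavailable` value is illustrative, and the status stanza only shows the fields Kubernetes fills in once the PDB is active:

[source,yaml]
----
# Sketch of the single role-wide PDB a role-level setting would result in;
# values are illustrative and follow the earlier examples in this ADR.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: simple-hdfs-namenodes
spec:
  maxUnavailable: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: hdfs
      app.kubernetes.io/instance: simple-hdfs
      app.kubernetes.io/component: namenode # no rolegroup label: spans all rolegroups
status:
  # filled in by Kubernetes; the eviction API only admits an eviction
  # while disruptionsAllowed is greater than zero
  disruptionsAllowed: 1
  currentHealthy: 2
  desiredHealthy: 1
  expectedPods: 2
  observedGeneration: 1
----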
+ +=== Option 4 + +[source,yaml] +---- +spec: + dataNodes: + config: + podDisruptionBudget: + maxUnavailable: 2 + roleGroups: + hdd: + replicas: 16 + ssd: + replicas: 8 + in-memory: + replicas: 4 +---- + +would end up with + +[source,yaml] +---- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: simple-hdfs-datanodes-hdds +spec: + maxUnavailable: 2 + selector: + matchLabels: + app.kubernetes.io/name: hdfs + app.kubernetes.io/instance: simple-hdfs + app.kubernetes.io/component: datanode + app.kubernetes.io/rolegroup: hdd +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: simple-hdfs-datanodes-hdds +spec: + maxUnavailable: 2 + selector: + matchLabels: + app.kubernetes.io/name: hdfs + app.kubernetes.io/instance: simple-hdfs + app.kubernetes.io/component: datanode + app.kubernetes.io/rolegroup: ssd +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: simple-hdfs-datanodes-hdds +spec: + maxUnavailable: 2 + selector: + matchLabels: + app.kubernetes.io/name: hdfs + app.kubernetes.io/instance: simple-hdfs + app.kubernetes.io/component: datanode + app.kubernetes.io/rolegroup: in-memory +---- + +[source,yaml] +---- +spec: + nameNodes: + config: + podDisruptionBudget: + enabled: true + maxUnavailable: 2 + roleGroups: + hdd: + replicas: 16 + config: + podDisruptionBudget: + maxUnavailable: 4 + ssd: + replicas: 8 + config: + podDisruptionBudget: + enabled: false + in-memory: + replicas: 4 +---- + +would end up with + +[source,yaml] +---- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: simple-hdfs-datanodes-hdds +spec: + maxUnavailable: 4 + selector: + matchLabels: + app.kubernetes.io/name: hdfs + app.kubernetes.io/instance: simple-hdfs + app.kubernetes.io/component: datanode + app.kubernetes.io/rolegroup: hdd +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: simple-hdfs-datanodes-hdds +spec: + maxUnavailable: 2 + selector: + matchLabels: + app.kubernetes.io/name: hdfs + app.kubernetes.io/instance: simple-hdfs + app.kubernetes.io/component: datanode + app.kubernetes.io/rolegroup: in-memory +---- + + + +==== Pros + +* + +==== Cons + +* + + + + + + + + + + +We end up with the following PDBs when the default values are used: [source,yaml] ---- @@ -100,7 +386,7 @@ kind: PodDisruptionBudget metadata: name: simple-hdfs-datanodes spec: - maxUnavailable: 2 + maxUnavailable: 2 # assuming dfs replication 3 selector: matchLabels: app.kubernetes.io/name: hdfs From ff692c73d544e45a0a0ad56d1659f0606e668f50 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Thu, 21 Sep 2023 12:53:37 +0200 Subject: [PATCH 12/17] WIP --- modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc index cb915e4b8..c4dc98eac 100644 --- a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc +++ b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc @@ -217,6 +217,7 @@ spec: * Yet another "config" (config, clusterConfig and now roleConfig as well) ** That's kind of the way the real world is: There are some thing you can configure on cluster level (e.g. ldap), role level (pdbs) and role group level (resources). This models this the closest. +* Its not possible to define PDBs on rolegroups without the user deploying it's own PDBs. 
=== Option 4 From 04d4247cb3f20d2cc533e4a2cc982670c38070e5 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Fri, 22 Sep 2023 10:38:53 +0200 Subject: [PATCH 13/17] updated ADR --- .../adr/ADR030-reduce-pod-disruptions.adoc | 283 ++++++++---------- 1 file changed, 117 insertions(+), 166 deletions(-) diff --git a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc index c4dc98eac..6e10e230e 100644 --- a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc +++ b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc @@ -8,59 +8,127 @@ v0.1, 2023-09-15 ** Sebastian Bernauer * Date: 2023-09-15 -== Context and Problem Statement +== Context and problem statement -Downtime of products is always bad. -Kubernetes has a a concepts called https://kubernetes.io/docs/tasks/run-application/configure-pdb/[PodDisruptionBudget] (PDB) to prevent this. -We want to use this functionary to try to reduce the downtime to an absolute minimum. +Downtime of products is always bad, but sometimes Pods need to be restarted to roll out updates or renew certificates. +To prevent services from becoming unavailable we need to make sure that there is always a certain number of Pods still online when restarting Pods. +Kubernetes has a concept called https://kubernetes.io/docs/tasks/run-application/configure-pdb/[PodDisruptionBudget] (PDB) to define the number of Pods +that need to be kept online or the number of Pods that can safely be taken offline. +We want to use this functionary to either prevent services outages entirely or try to keep them to a minimum. +PDBs are defined not on a StatefulSet or Deployment, but with a selector over labels, so they can also span Pods from multiple StatefulSets. -== Decision Drivers +=== Example use-cases -* Ease of use and comprehensibility for the user -** Principle of least surprise -* Easy implementation (far less important) +1. As a user I want an HDFS and it (or parts) should not be disturbed by planned pod evictions (for example for a certificate renewal). I expect this to be the default behaviour. +2. As a user I want to configure maxUnavailable on the role (e.g. datanode) across all rolegroups (e.g. dfs replicas 3 and only a single datanode is allowed to go down - regardless of the number of rolegroups), so that no datanode is a single point of failure. Similarly for ZooKeeper, I want to define PDBs at role level as ZK quorum is independent of role groups. +3. As a user I want to override defaults to maybe have less availability but faster rollout times in rolling redeployments; for example a Trino cluster that could take more than 6 hours to rolling redeploy, as the graceful shutdown of Trino workers takes a considerable amount of time - depended on the queries getting executed. +4. As a user I want to configure maxUnavailable on rolegroups individually, as I e.g. have some fast datanodes using SSDs and some slow datanodes using HDDs. I want to have always X number of fast datanodes online for performance reasons. +5. As a user I want a Superset/NiFi/Kafka and they (or parts) should not be disturbed by planned pod evictions. +6. As a user I might want to define PDBs across roles or on other specific Pod selections, in that case I want to be able to disable the Stackable generated PDBs. + +We expect the majority of users to either use default PDB settings or define PDBs at a role level. 
Role group configuration like in use-case 4 has merit but seems like a more niche usage scenario.

=== Technical considerations

We have the following constraints:

If we use https://kubernetes.io/docs/tasks/run-application/configure-pdb/#arbitrary-controllers-and-selectors[arbitrary workloads and arbitrary selectors] (for example when selecting Pods from multiple StatefulSets) we have the following constraints:
 * only `.spec.minAvailable` can be used, not `.spec.maxUnavailable`.
 * only an integer value can be used with `.spec.minAvailable`, not a percentage.

This means that if we select any Pods that are not part of a StatefulSet or Deployment etc. then we are bound by these constraints. Preliminary testing showed that `.spec.maxUnavailable` works with multiple StatefulSets.

You can use a selector which selects a subset or superset of the pods belonging to a workload resource. The eviction API will disallow eviction of any pod covered by multiple PDBs, so most users will want to avoid overlapping selectors.

We need to create PDBs in such a way that every Pod is only selected once. This is easiest if a selector is defined per role or for all role groups individually. Excluding certain labels is also possible by using match expressions, but we did not test whether it conflicts with the first constraint about arbitrary selectors.
To support users creating their own custom PDBs we need to support disabling PDB generation to prevent overlapping selectors.

== Decision drivers

* Common use-cases should be easy to configure.
* Principle of least surprise: CRD configuration settings and their interactions in case of multiple settings need to be easy to comprehend to prevent user error.
* Extendable design, so that we can later add new functionality in a non-breaking way, such as the option to configure PDBs on roleGroup level as well.
* Simple implementation (far less important)

== Decision outcome

Option 1 was picked.

== Considered options

=== Option 1

Introduce a new `roleConfig` at role level and put PDBs in there.
Only role level PDBs are supported, for role group level the PDBs should be disabled and the user needs to create PDBs manually. The `roleConfig` is put in place to not put the PDB setting directly in the role. -We have the following constraints: +[source,yaml] +---- +spec: + nameNodes: + roleConfig: # <<< + podDisruptionBudget: # optional + enabled: true # optional, defaults to true + maxUnavailable: 1 # optional, defaults to our "smart" calculation + roleGroups: + default: + replicas: 2 + dataNodes: + # use pdb defaults + roleGroups: + default: + replicas: 2 +---- -1. If we use https://kubernetes.io/docs/tasks/run-application/configure-pdb/#arbitrary-controllers-and-selectors[arbitrary workloads and arbitrary selectors] we have the following constraints: - * only `.spec.minAvailable` can be used, not `.spec.maxUnavailable`. - * only an integer value can be used with `.spec.minAvailable`, not a percentage. -2. You can use a selector which selects a subset or superset of the pods belonging to a workload resource. The eviction API will disallow eviction of any pod covered by multiple PDBs, so most users will want to avoid overlapping selectors +==== Pros -Because of the mentioned constraints we have the following implications: +* simple to understand +* covers the majority of use cases +* still leaves the option to disable and roll your own -1. Use `.spec.maxUnavailable` everywhere -2. Have `.spec.maxUnavailable` configurable on the product CRD. -3. Create PodDisruptionBudget over the role and not over the rolegroups, as e.g. the Zookeeper quorum does not care about rolegroups. As of the docs we can not add a PDB for the role and the rolegroup at the same time. -4. Users must be able to disable our PDB creation in the case they want to define their own, as otherwise the Pods would have multiple PDBs, which is not supported. -5. We try to have a PDB per role, as this makes things much easier than e.g. saying "out of the namenodes and journalnodes only one can be down". Otherwise we can not make it "simply" configurable on the role. +==== Cons -== Question 1: Do we want to support configuring PDBs on role or role and rolegroup? +* Yet another "config" (config, clusterConfig and now roleConfig as well) +** That's kind of the way the real world is: There are some thing you can configure on cluster level (e.g. ldap), role level (pdbs) and role group level (resources). This models this the closest. +* Its not possible to define PDBs on rolegroups without the user deploying it's own PDBs. -=== Option 1: Configurable on role level +NOTE: In the discussion the option of having the PDB directly in the role without a `roleConfig` was briefly discussed but not considered as an option due to being too messy, so it is not listed as an explicit option here. -=== Option 2: Configurable on role + rolegroup level +=== Option 2a - PDB in `config`, but only at role level -Cons: +Instead of inventing a new `roleConfig` setting, put the PDB in the `config`. This might seem better at first, but usually settings in `config` can also be set at role group level, and in this case, that would not be true. -* It's really really complicated for the user and the implementation. 
+[source,yaml] +---- +spec: + nameNodes: + config: # <<< + podDisruptionBudget: + enabled: true + maxUnavailable: 1 + roleGroups: + default: + replicas: 2 + config: {} + # no such field as podDisruptionBudget +---- + +==== Pros + +* Everything configurable is below `config`, no new `roleConfig` +* Like Option 1, covers configuration of the most important use cases + +==== Cons -.Explanation +* `spec.nameNodes.config` is *not* similar to `spec.nameNodes.roleGroups.default.config` => Confusing to the user +** thinking more about it, it might be confusing that the setting is not "copied" to all role groups like other settings like resources or affinities. +* Still no option to configure role group level PDBs +* Possibly complicated to implement, due to `config` usually being identical at role and role group level + +=== Option 2b: PDB in config with elaborate merge mechanism + +Similar to Option 2a, the PDB setting is located in the `config` but it is actually possible to use it at both role and role group level. +We develop a semantic merge mechanism that would prevent overlapping PDBs. + +.CRD Example [%collapsible] ==== [source,yaml] @@ -134,93 +202,25 @@ spec: ---- ==== -Chosen option: *Option 1: Configurable on role level* - -== Question 2: How dows the CRD structure look like? - -=== Option 1 - -[source,yaml] ----- -apiVersion: hdfs.stackable.tech/v1alpha1 -kind: HdfsCluster -metadata: - name: simple-hdfs -spec: - nameNodes: - podDisruptionBudget: # optional - enabled: true # optional, defaults to true - maxUnavailable: 1 # optional, defaults to our "smart" calculation - roleGroups: - default: - replicas: 2 - dataNodes: - # use pdb defaults - roleGroups: - default: - replicas: 2 ----- - -==== Pros - -* Everything below `config` can be merged, everything below `clusterConfig` has applied to the whole cluster (no exceptions) - -==== Cons - -* Bloating `spec.namenodes` - -=== Option 2 - -[source,yaml] ----- -spec: - nameNodes: - config: # <<< - podDisruptionBudget: - enabled: true - maxUnavailable: 1 - roleGroups: - default: - replicas: 2 - config: {} - # no such field as podDisruptionBudget ----- - ==== Pros -* Everything configurable is below `config` - some attributes of it can be merged - or `clusterConfig`. +* Fits into the existing config structure +* Allows configuring role config level PDBs and even hybrid configs ==== Cons -* `spec.nameNodes.config` is *not* similar to `spec.nameNodes.roleGroups.default.config` => Confusing to the user +* Complex merge mechanism possibly difficult to understand and therefore easy to use the wrong way +* Complex mechanism also not trivial to implement -=== Option 3 +=== Option 2c - PDB in config with normal "shared role group config" behaviour -[source,yaml] ----- -spec: - nameNodes: - roleConfig: # <<< - podDisruptionBudget: - enabled: true - maxUnavailable: 1 - roleGroups: - default: - replicas: 2 ----- +Again we put the PDB in the `config` section but simply use the normal "copy" behaviour for this setting. +This would be simple and easy to understand, but does not allow for true role level PDBs. -==== Pros - -* Not bloating `spec.namenodes` - -==== Cons - -* Yet another "config" (config, clusterConfig and now roleConfig as well) -** That's kind of the way the real world is: There are some thing you can configure on cluster level (e.g. ldap), role level (pdbs) and role group level (resources). This models this the closest. -* Its not possible to define PDBs on rolegroups without the user deploying it's own PDBs. 
- -=== Option 4 +.CRD Example +[%collapsible] +==== [source,yaml] ---- spec: @@ -334,63 +334,14 @@ spec: app.kubernetes.io/component: datanode app.kubernetes.io/rolegroup: in-memory ---- - - +==== ==== Pros -* +* easy to understand +* easy to implement +* works the same as all other config ==== Cons -* - - - - - - - - - - -We end up with the following PDBs when the default values are used: - -[source,yaml] ----- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - name: simple-hdfs-journalnodes -spec: - maxUnavailable: 1 - selector: - matchLabels: - app.kubernetes.io/name: hdfs - app.kubernetes.io/instance: simple-hdfs - app.kubernetes.io/component: journalnode ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - name: simple-hdfs-namenodes -spec: - maxUnavailable: 1 - selector: - matchLabels: - app.kubernetes.io/name: hdfs - app.kubernetes.io/instance: simple-hdfs - app.kubernetes.io/component: namenode ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - name: simple-hdfs-datanodes -spec: - maxUnavailable: 2 # assuming dfs replication 3 - selector: - matchLabels: - app.kubernetes.io/name: hdfs - app.kubernetes.io/instance: simple-hdfs - app.kubernetes.io/component: datanode ----- +* Does not support the common use case of role level PDBs From 1d4836db731761cc6ffcf5d485142d238e2aeb9f Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Fri, 22 Sep 2023 11:39:15 +0200 Subject: [PATCH 14/17] renamed options --- .../pages/adr/ADR030-reduce-pod-disruptions.adoc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc index 6e10e230e..8c214602b 100644 --- a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc +++ b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc @@ -92,7 +92,7 @@ spec: NOTE: In the discussion the option of having the PDB directly in the role without a `roleConfig` was briefly discussed but not considered as an option due to being too messy, so it is not listed as an explicit option here. -=== Option 2a - PDB in `config`, but only at role level +=== Option 2 - PDB in `config`, but only at role level Instead of inventing a new `roleConfig` setting, put the PDB in the `config`. This might seem better at first, but usually settings in `config` can also be set at role group level, and in this case, that would not be true. @@ -123,9 +123,9 @@ spec: * Still no option to configure role group level PDBs * Possibly complicated to implement, due to `config` usually being identical at role and role group level -=== Option 2b: PDB in config with elaborate merge mechanism +=== Option 3: PDB in config with elaborate merge mechanism -Similar to Option 2a, the PDB setting is located in the `config` but it is actually possible to use it at both role and role group level. +Similar to Option 2, the PDB setting is located in the `config` but it is actually possible to use it at both role and role group level. We develop a semantic merge mechanism that would prevent overlapping PDBs. 
.CRD Example @@ -212,7 +212,7 @@ spec: * Complex merge mechanism possibly difficult to understand and therefore easy to use the wrong way * Complex mechanism also not trivial to implement -=== Option 2c - PDB in config with normal "shared role group config" behaviour +=== Option 4 - PDB in config with normal "shared role group config" behaviour Again we put the PDB in the `config` section but simply use the normal "copy" behaviour for this setting. This would be simple and easy to understand, but does not allow for true role level PDBs. From 25dba57e33813138485bc5c629eb708aa057d28f Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Fri, 22 Sep 2023 11:39:49 +0200 Subject: [PATCH 15/17] Update modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc --- .../contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc index 8c214602b..040937eef 100644 --- a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc +++ b/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc @@ -5,7 +5,11 @@ v0.1, 2023-09-15 * Status: {status} * Deciders: +** Felix Hennig +** Lars Francke +** Sascha Lautenschläger ** Sebastian Bernauer +** Sönke Liebau * Date: 2023-09-15 == Context and problem statement From 8474bf8dd5827c37c20ded10fd0e8ffe2045f10b Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Mon, 25 Sep 2023 11:32:10 +0200 Subject: [PATCH 16/17] Rename to "Allowed Pod disruptions" --- ...d-disruptions.adoc => ADR030-allowed-pod-disruptions.adoc} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename modules/contributor/pages/adr/{ADR030-reduce-pod-disruptions.adoc => ADR030-allowed-pod-disruptions.adoc} (99%) diff --git a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc b/modules/contributor/pages/adr/ADR030-allowed-pod-disruptions.adoc similarity index 99% rename from modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc rename to modules/contributor/pages/adr/ADR030-allowed-pod-disruptions.adoc index 040937eef..f7b1d77e2 100644 --- a/modules/contributor/pages/adr/ADR030-reduce-pod-disruptions.adoc +++ b/modules/contributor/pages/adr/ADR030-allowed-pod-disruptions.adoc @@ -1,4 +1,4 @@ -= ADR030: Reduce Pod disruptions += ADR030: Allowed Pod disruptions Sebastian Bernauer v0.1, 2023-09-15 :status: accepted @@ -214,7 +214,7 @@ spec: ==== Cons * Complex merge mechanism possibly difficult to understand and therefore easy to use the wrong way -* Complex mechanism also not trivial to implement +* Complex mechanism also not trivial to implement === Option 4 - PDB in config with normal "shared role group config" behaviour From ca7abd3d0f26ce5797f698461edcdcd54ecd1005 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Fri, 29 Sep 2023 10:34:01 +0200 Subject: [PATCH 17/17] Link from concepts apge to ADR --- modules/concepts/pages/operations/pod_disruptions.adoc | 2 +- modules/contributor/partials/current_adrs.adoc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/concepts/pages/operations/pod_disruptions.adoc b/modules/concepts/pages/operations/pod_disruptions.adoc index 89a71bf7f..86d912185 100644 --- a/modules/concepts/pages/operations/pod_disruptions.adoc +++ b/modules/concepts/pages/operations/pod_disruptions.adoc @@ -91,4 +91,4 @@ spec: This PDB allows only one Pod out of all the Namenodes and Journalnodes to be down at one 
time. == Details -Have a look at <<< TODO: link ADR on Pod Disruptions once merged >>> for the implementation details. +Have a look at the xref:contributor:adr/ADR030-allowed-pod-disruptions.adoc[ADR on Allowed Pod disruptions] for the implementation details. diff --git a/modules/contributor/partials/current_adrs.adoc b/modules/contributor/partials/current_adrs.adoc index 1e9a73ae0..a73d33bc4 100644 --- a/modules/contributor/partials/current_adrs.adoc +++ b/modules/contributor/partials/current_adrs.adoc @@ -26,4 +26,4 @@ **** xref:adr/ADR027-status.adoc[] **** xref:adr/ADR028-automatic-stackable-version.adoc[] **** xref:adr/ADR029-database-connection.adoc[] -**** xref:adr/ADR030-reduce-pod-disruptions.adoc[] +**** xref:adr/ADR030-allowed-pod-disruptions.adoc[]