
Commit 25ccc87

sync all resources to cluster fields (#2713)
* sync all resources to cluster fields (CronJob, Streams, Patroni resources)
* separated sync and delete logic for Patroni resources
* align delete streams and secrets logic with other resources
* rename gatherApplicationIds to getDistinctApplicationIds
* improve slot check before syncing streams CRD
* add ownerReferences and annotations diff to Patroni objects
* add extra sync code for config service so it does not get too ugly
* some bugfixes when comparing annotations and return err on found
* sync Patroni resources on update event and extended unit tests
* add config service/endpoint owner references check to e2e test
1 parent 31f92a1 commit 25ccc87
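
The streams-related files (e.g. pkg/cluster/streams.go and its tests) are not among the diffs shown below, so the renamed helper is only visible by name in the commit message. As a rough, hedged sketch of what a getDistinctApplicationIds over the manifest's streams section amounts to (the acidv1.Stream type and its ApplicationId field are assumed here, not taken from this excerpt):

package cluster

import (
    acidv1 "github.com/zalando/postgres-operator/pkg/apis/acid.zalan.do/v1"
)

// getDistinctApplicationIds returns every applicationId from the streams
// section exactly once, preserving first-seen order. Sketch only: the real
// implementation is not part of this excerpt and may differ.
func getDistinctApplicationIds(streams []acidv1.Stream) []string {
    seen := make(map[string]bool)
    ids := make([]string, 0, len(streams))
    for _, stream := range streams {
        if !seen[stream.ApplicationId] {
            seen[stream.ApplicationId] = true
            ids = append(ids, stream.ApplicationId)
        }
    }
    return ids
}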

11 files changed with 666 additions and 302 deletions

docs/administrator.md

Lines changed: 2 additions & 3 deletions
@@ -252,17 +252,16 @@ will differ and trigger a rolling update of the pods.
 ## Owner References and Finalizers
 
 The Postgres Operator can set [owner references](https://kubernetes.io/docs/concepts/overview/working-with-objects/owners-dependents/) to most of a cluster's child resources to improve
-monitoring with GitOps tools and enable cascading deletes. There are three
+monitoring with GitOps tools and enable cascading deletes. There are two
 exceptions:
 
 * Persistent Volume Claims, because they are handled by the [PV Reclaim Policy](https://kubernetes.io/docs/tasks/administer-cluster/change-pv-reclaim-policy/) of the Stateful Set
-* The config endpoint + headless service resource because it is managed by Patroni
 * Cross-namespace secrets, because owner references are not allowed across namespaces by design
 
 The operator would clean these resources up with its regular delete loop
 unless they got synced correctly. If for some reason the initial cluster sync
 fails, e.g. after a cluster creation or operator restart, a deletion of the
-cluster manifest would leave orphaned resources behind which the user has to
+cluster manifest might leave orphaned resources behind which the user has to
 clean up manually.
 
 Another option is to enable finalizers which first ensures the deletion of all
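
With the config service and endpoint no longer excepted, they carry the same owner reference as the other child resources. A minimal sketch of such a reference pointing back at the postgresql custom resource (helper name and exact field values are chosen here for illustration; the operator's own helper is not part of this diff):

package cluster

import (
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/types"
)

// ownerReferenceToPostgresql builds the owner reference a child resource such
// as the <cluster>-config service would carry, so that deleting the postgresql
// custom resource cascades to it. Illustrative sketch, not the operator's code.
func ownerReferenceToPostgresql(clusterName string, clusterUID types.UID) []metav1.OwnerReference {
    controller := true
    return []metav1.OwnerReference{
        {
            APIVersion: "acid.zalan.do/v1",
            Kind:       "postgresql",
            Name:       clusterName,
            UID:        clusterUID,
            Controller: &controller,
        },
    }
}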

e2e/tests/test_e2e.py

Lines changed: 16 additions & 13 deletions
@@ -402,8 +402,8 @@ def test_config_update(self):
 "max_connections": new_max_connections_value,
 "wal_level": "logical"
 }
-},
-"patroni": {
+},
+"patroni": {
 "slots": {
 "first_slot": {
 "type": "physical"
@@ -414,7 +414,7 @@ def test_config_update(self):
 "retry_timeout": 9,
 "synchronous_mode": True,
 "failsafe_mode": True,
-}
+}
 }
 }

@@ -517,7 +517,7 @@ def compare_config():
 pg_add_new_slots_patch = {
 "spec": {
 "patroni": {
-"slots": {
+"slots": {
 "test_slot": {
 "type": "logical",
 "database": "foo",
@@ -1667,19 +1667,18 @@ def test_owner_references(self):
 k8s.api.custom_objects_api.delete_namespaced_custom_object(
 "acid.zalan.do", "v1", self.test_namespace, "postgresqls", cluster_name)

-# statefulset, pod disruption budget and secrets should be deleted via owner reference
+# child resources with owner references should be deleted via owner references
 self.eventuallyEqual(lambda: k8s.count_pods_with_label(cluster_label), 0, "Pods not deleted")
 self.eventuallyEqual(lambda: k8s.count_statefulsets_with_label(cluster_label), 0, "Statefulset not deleted")
+self.eventuallyEqual(lambda: k8s.count_services_with_label(cluster_label), 0, "Services not deleted")
+self.eventuallyEqual(lambda: k8s.count_endpoints_with_label(cluster_label), 0, "Endpoints not deleted")
 self.eventuallyEqual(lambda: k8s.count_pdbs_with_label(cluster_label), 0, "Pod disruption budget not deleted")
 self.eventuallyEqual(lambda: k8s.count_secrets_with_label(cluster_label), 0, "Secrets were not deleted")

-time.sleep(5) # wait for the operator to also delete the leftovers
+time.sleep(5) # wait for the operator to also delete the PVCs

-# pvcs and Patroni config service/endpoint should not be affected by owner reference
-# but deleted by the operator almost immediately
+# pvcs do not have an owner reference but will be deleted by the operator almost immediately
 self.eventuallyEqual(lambda: k8s.count_pvcs_with_label(cluster_label), 0, "PVCs not deleted")
-self.eventuallyEqual(lambda: k8s.count_services_with_label(cluster_label), 0, "Patroni config service not deleted")
-self.eventuallyEqual(lambda: k8s.count_endpoints_with_label(cluster_label), 0, "Patroni config endpoint not deleted")

 # disable owner references in config
 disable_owner_refs = {
@@ -2143,13 +2142,13 @@ def test_stream_resources(self):
 # update the manifest with the streams section
 patch_streaming_config = {
 "spec": {
-"patroni": {
+"patroni": {
 "slots": {
 "manual_slot": {
 "type": "physical"
 }
 }
-},
+},
 "streams": [
 {
 "applicationId": "test-app",
@@ -2481,11 +2480,15 @@ def check_cluster_child_resources_owner_references(self, cluster_name, cluster_n
 self.assertTrue(self.has_postgresql_owner_reference(svc.metadata.owner_references, inverse), "primary service owner reference check failed")
 replica_svc = k8s.api.core_v1.read_namespaced_service(cluster_name + "-repl", cluster_namespace)
 self.assertTrue(self.has_postgresql_owner_reference(replica_svc.metadata.owner_references, inverse), "replica service owner reference check failed")
+config_svc = k8s.api.core_v1.read_namespaced_service(cluster_name + "-config", cluster_namespace)
+self.assertTrue(self.has_postgresql_owner_reference(config_svc.metadata.owner_references, inverse), "config service owner reference check failed")

 ep = k8s.api.core_v1.read_namespaced_endpoints(cluster_name, cluster_namespace)
 self.assertTrue(self.has_postgresql_owner_reference(ep.metadata.owner_references, inverse), "primary endpoint owner reference check failed")
 replica_ep = k8s.api.core_v1.read_namespaced_endpoints(cluster_name + "-repl", cluster_namespace)
-self.assertTrue(self.has_postgresql_owner_reference(replica_ep.metadata.owner_references, inverse), "replica owner reference check failed")
+self.assertTrue(self.has_postgresql_owner_reference(replica_ep.metadata.owner_references, inverse), "replica endpoint owner reference check failed")
+config_ep = k8s.api.core_v1.read_namespaced_endpoints(cluster_name + "-config", cluster_namespace)
+self.assertTrue(self.has_postgresql_owner_reference(config_ep.metadata.owner_references, inverse), "config endpoint owner reference check failed")

 pdb = k8s.api.policy_v1.read_namespaced_pod_disruption_budget("postgres-{}-pdb".format(cluster_name), cluster_namespace)
 self.assertTrue(self.has_postgresql_owner_reference(pdb.metadata.owner_references, inverse), "pod disruption owner reference check failed")
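
The new assertions above read the <cluster>-config service and endpoint and expect the postgresql owner reference on them, which matches the "ownerReferences and annotations diff to Patroni objects" from the commit message. As a rough sketch (not the operator's actual sync code, which lives in files outside this excerpt), such a metadata diff boils down to:

package cluster

import (
    "reflect"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// patroniObjectNeedsUpdate sketches the comparison: if owner references or
// annotations on the live Patroni-managed object differ from the desired
// metadata, the sync loop should patch the object. Sketch only.
func patroniObjectNeedsUpdate(current, desired metav1.ObjectMeta) (bool, string) {
    if !reflect.DeepEqual(current.OwnerReferences, desired.OwnerReferences) {
        return true, "new owner references do not match the current ones"
    }
    if !reflect.DeepEqual(current.Annotations, desired.Annotations) {
        return true, "new annotations do not match the current ones"
    }
    return false, ""
}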

pkg/cluster/cluster.go

Lines changed: 30 additions & 107 deletions
@@ -3,7 +3,6 @@ package cluster
 // Postgres CustomResourceDefinition object i.e. Spilo

 import (
-"context"
 "database/sql"
 "encoding/json"
 "fmt"
@@ -15,6 +14,7 @@ import (

 "github.com/sirupsen/logrus"
 acidv1 "github.com/zalando/postgres-operator/pkg/apis/acid.zalan.do/v1"
+zalandov1 "github.com/zalando/postgres-operator/pkg/apis/zalando.org/v1"

 "github.com/zalando/postgres-operator/pkg/generated/clientset/versioned/scheme"
 "github.com/zalando/postgres-operator/pkg/spec"
@@ -30,7 +30,6 @@ import (
 appsv1 "k8s.io/api/apps/v1"
 batchv1 "k8s.io/api/batch/v1"
 v1 "k8s.io/api/core/v1"
-apipolicyv1 "k8s.io/api/policy/v1"
 policyv1 "k8s.io/api/policy/v1"
 rbacv1 "k8s.io/api/rbac/v1"
 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -62,9 +61,13 @@ type Config struct {
 type kubeResources struct {
 Services map[PostgresRole]*v1.Service
 Endpoints map[PostgresRole]*v1.Endpoints
+PatroniEndpoints map[string]*v1.Endpoints
+PatroniConfigMaps map[string]*v1.ConfigMap
 Secrets map[types.UID]*v1.Secret
 Statefulset *appsv1.StatefulSet
 PodDisruptionBudget *policyv1.PodDisruptionBudget
+LogicalBackupJob *batchv1.CronJob
+Streams map[string]*zalandov1.FabricEventStream
 //Pods are treated separately
 //PVCs are treated separately
 }
@@ -132,9 +135,12 @@ func New(cfg Config, kubeClient k8sutil.KubernetesClient, pgSpec acidv1.Postgres
 systemUsers: make(map[string]spec.PgUser),
 podSubscribers: make(map[spec.NamespacedName]chan PodEvent),
 kubeResources: kubeResources{
-Secrets: make(map[types.UID]*v1.Secret),
-Services: make(map[PostgresRole]*v1.Service),
-Endpoints: make(map[PostgresRole]*v1.Endpoints)},
+Secrets: make(map[types.UID]*v1.Secret),
+Services: make(map[PostgresRole]*v1.Service),
+Endpoints: make(map[PostgresRole]*v1.Endpoints),
+PatroniEndpoints: make(map[string]*v1.Endpoints),
+PatroniConfigMaps: make(map[string]*v1.ConfigMap),
+Streams: make(map[string]*zalandov1.FabricEventStream)},
 userSyncStrategy: users.DefaultUserSyncStrategy{
 PasswordEncryption: passwordEncryption,
 RoleDeletionSuffix: cfg.OpConfig.RoleDeletionSuffix,
@@ -357,6 +363,11 @@ func (c *Cluster) Create() (err error) {
 c.logger.Infof("pods are ready")
 c.eventRecorder.Event(c.GetReference(), v1.EventTypeNormal, "StatefulSet", "Pods are ready")

+// sync resources created by Patroni
+if err = c.syncPatroniResources(); err != nil {
+c.logger.Warnf("Patroni resources not yet synced: %v", err)
+}
+
 // create database objects unless we are running without pods or disabled
 // that feature explicitly
 if !(c.databaseAccessDisabled() || c.getNumberOfInstances(&c.Spec) <= 0 || c.Spec.StandbyCluster != nil) {
@@ -382,10 +393,6 @@ func (c *Cluster) Create() (err error) {
 c.logger.Info("a k8s cron job for logical backup has been successfully created")
 }

-if err := c.listResources(); err != nil {
-c.logger.Errorf("could not list resources: %v", err)
-}
-
 // Create connection pooler deployment and services if necessary. Since we
 // need to perform some operations with the database itself (e.g. install
 // lookup function), do it as the last step, when everything is available.
@@ -410,6 +417,10 @@ func (c *Cluster) Create() (err error) {
 }
 }

+if err := c.listResources(); err != nil {
+c.logger.Errorf("could not list resources: %v", err)
+}
+
 return nil
 }

@@ -856,7 +867,7 @@ func (c *Cluster) compareLogicalBackupJob(cur, new *batchv1.CronJob) (match bool
 return true, ""
 }

-func (c *Cluster) comparePodDisruptionBudget(cur, new *apipolicyv1.PodDisruptionBudget) (bool, string) {
+func (c *Cluster) comparePodDisruptionBudget(cur, new *policyv1.PodDisruptionBudget) (bool, string) {
 //TODO: improve comparison
 if !reflect.DeepEqual(new.Spec, cur.Spec) {
 return false, "new PDB's spec does not match the current one"
@@ -977,6 +988,12 @@ func (c *Cluster) Update(oldSpec, newSpec *acidv1.Postgresql) error {
 updateFailed = true
 }

+// Patroni service and endpoints / config maps
+if err := c.syncPatroniResources(); err != nil {
+c.logger.Errorf("could not sync services: %v", err)
+updateFailed = true
+}
+
 // Users
 func() {
 // check if users need to be synced during update
@@ -1191,7 +1208,6 @@ func (c *Cluster) Delete() error {
 }

 for _, role := range []PostgresRole{Master, Replica} {
-
 if !c.patroniKubernetesUseConfigMaps() {
 if err := c.deleteEndpoint(role); err != nil {
 anyErrors = true
@@ -1207,10 +1223,10 @@ func (c *Cluster) Delete() error {
 }
 }

-if err := c.deletePatroniClusterObjects(); err != nil {
+if err := c.deletePatroniResources(); err != nil {
 anyErrors = true
-c.logger.Warningf("could not remove leftover patroni objects; %v", err)
-c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Delete", "could not remove leftover patroni objects; %v", err)
+c.logger.Warningf("could not delete all Patroni resources: %v", err)
+c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Delete", "could not delete all Patroni resources: %v", err)
 }

 // Delete connection pooler objects anyway, even if it's not mentioned in the
@@ -1742,96 +1758,3 @@ func (c *Cluster) Lock() {
 func (c *Cluster) Unlock() {
 c.mu.Unlock()
 }
-
-type simpleActionWithResult func()
-
-type clusterObjectGet func(name string) (spec.NamespacedName, error)
-
-type clusterObjectDelete func(name string) error
-
-func (c *Cluster) deletePatroniClusterObjects() error {
-// TODO: figure out how to remove leftover patroni objects in other cases
-var actionsList []simpleActionWithResult
-
-if !c.patroniUsesKubernetes() {
-c.logger.Infof("not cleaning up Etcd Patroni objects on cluster delete")
-}
-
-actionsList = append(actionsList, c.deletePatroniClusterServices)
-if c.patroniKubernetesUseConfigMaps() {
-actionsList = append(actionsList, c.deletePatroniClusterConfigMaps)
-} else {
-actionsList = append(actionsList, c.deletePatroniClusterEndpoints)
-}
-
-c.logger.Debugf("removing leftover Patroni objects (endpoints / services and configmaps)")
-for _, deleter := range actionsList {
-deleter()
-}
-return nil
-}
-
-func deleteClusterObject(
-get clusterObjectGet,
-del clusterObjectDelete,
-objType string,
-clusterName string,
-logger *logrus.Entry) {
-for _, suffix := range patroniObjectSuffixes {
-name := fmt.Sprintf("%s-%s", clusterName, suffix)
-
-namespacedName, err := get(name)
-if err == nil {
-logger.Debugf("deleting %s %q",
-objType, namespacedName)
-
-if err = del(name); err != nil {
-logger.Warningf("could not delete %s %q: %v",
-objType, namespacedName, err)
-}
-
-} else if !k8sutil.ResourceNotFound(err) {
-logger.Warningf("could not fetch %s %q: %v",
-objType, namespacedName, err)
-}
-}
-}
-
-func (c *Cluster) deletePatroniClusterServices() {
-get := func(name string) (spec.NamespacedName, error) {
-svc, err := c.KubeClient.Services(c.Namespace).Get(context.TODO(), name, metav1.GetOptions{})
-return util.NameFromMeta(svc.ObjectMeta), err
-}
-
-deleteServiceFn := func(name string) error {
-return c.KubeClient.Services(c.Namespace).Delete(context.TODO(), name, c.deleteOptions)
-}
-
-deleteClusterObject(get, deleteServiceFn, "service", c.Name, c.logger)
-}
-
-func (c *Cluster) deletePatroniClusterEndpoints() {
-get := func(name string) (spec.NamespacedName, error) {
-ep, err := c.KubeClient.Endpoints(c.Namespace).Get(context.TODO(), name, metav1.GetOptions{})
-return util.NameFromMeta(ep.ObjectMeta), err
-}
-
-deleteEndpointFn := func(name string) error {
-return c.KubeClient.Endpoints(c.Namespace).Delete(context.TODO(), name, c.deleteOptions)
-}
-
-deleteClusterObject(get, deleteEndpointFn, "endpoint", c.Name, c.logger)
-}
-
-func (c *Cluster) deletePatroniClusterConfigMaps() {
-get := func(name string) (spec.NamespacedName, error) {
-cm, err := c.KubeClient.ConfigMaps(c.Namespace).Get(context.TODO(), name, metav1.GetOptions{})
-return util.NameFromMeta(cm.ObjectMeta), err
-}
-
-deleteConfigMapFn := func(name string) error {
-return c.KubeClient.ConfigMaps(c.Namespace).Delete(context.TODO(), name, c.deleteOptions)
-}
-
-deleteClusterObject(get, deleteConfigMapFn, "configmap", c.Name, c.logger)
-}

pkg/cluster/connection_pooler.go

Lines changed: 1 addition & 1 deletion
@@ -655,7 +655,7 @@ func (c *Cluster) deleteConnectionPoolerSecret() (err error) {
 if err != nil {
 c.logger.Debugf("could not get connection pooler secret %s: %v", secretName, err)
 } else {
-if err = c.deleteSecret(secret.UID, *secret); err != nil {
+if err = c.deleteSecret(secret.UID); err != nil {
 return fmt.Errorf("could not delete pooler secret: %v", err)
 }
 }
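
deleteSecret now takes only the UID; with secrets cached in kubeResources.Secrets (keyed by types.UID, see cluster.go above), the object itself can be resolved inside the helper. A hedged sketch of that lookup, with name and details chosen here for illustration since the actual deleteSecret is not part of this excerpt:

package cluster

import (
    "context"
    "fmt"

    "k8s.io/apimachinery/pkg/types"
)

// deleteSecretByUID (sketch): resolve the cached secret by UID, delete it from
// the API server and drop it from the cache.
func (c *Cluster) deleteSecretByUID(uid types.UID) error {
    secret, ok := c.Secrets[uid]
    if !ok {
        return fmt.Errorf("no secret with UID %q found in the cache", uid)
    }

    if err := c.KubeClient.Secrets(secret.Namespace).Delete(context.TODO(), secret.Name, c.deleteOptions); err != nil {
        return fmt.Errorf("could not delete secret %q: %v", secret.Name, err)
    }
    delete(c.Secrets, uid)
    return nil
}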

pkg/cluster/k8sres.go

Lines changed: 5 additions & 11 deletions
@@ -79,19 +79,13 @@ func (c *Cluster) statefulSetName() string {
 return c.Name
 }

-func (c *Cluster) endpointName(role PostgresRole) string {
-name := c.Name
-if role == Replica {
-name = fmt.Sprintf("%s-%s", name, "repl")
-}
-
-return name
-}
-
 func (c *Cluster) serviceName(role PostgresRole) string {
 name := c.Name
-if role == Replica {
+switch role {
+case Replica:
 name = fmt.Sprintf("%s-%s", name, "repl")
+case Patroni:
+name = fmt.Sprintf("%s-%s", name, "config")
 }

 return name
@@ -2072,7 +2066,7 @@ func (c *Cluster) getCustomServiceAnnotations(role PostgresRole, spec *acidv1.Po
 func (c *Cluster) generateEndpoint(role PostgresRole, subsets []v1.EndpointSubset) *v1.Endpoints {
 endpoints := &v1.Endpoints{
 ObjectMeta: metav1.ObjectMeta{
-Name: c.endpointName(role),
+Name: c.serviceName(role),
 Namespace: c.Namespace,
 Annotations: c.annotationsSet(nil),
 Labels: c.roleLabelsSet(true, role),
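
With endpointName removed, service and endpoint names now come from the single serviceName helper, which is why the e2e test above reads the "-config" objects. For illustration, assuming a cluster named acid-minimal-cluster (name chosen here, not from the diff):

serviceName(Master)  -> "acid-minimal-cluster"         (primary service and endpoint)
serviceName(Replica) -> "acid-minimal-cluster-repl"    (replica service and endpoint)
serviceName(Patroni) -> "acid-minimal-cluster-config"  (Patroni config service and endpoint)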
