Skip to content

K8SPSMDB-1211: handle FULL CLUSTER CRASH error during the restore #1926

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 29 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
e2b7e97
K8SPSMDB-1211: handle `FULL CLUSTER CRASH` error during the restore
pooknull May 16, 2025
47073d9
Merge remote-tracking branch 'origin/main' into K8SPSMDB-1211
pooknull May 19, 2025
49cc044
remove unused comment
pooknull May 19, 2025
19de9e6
fix lint
pooknull May 19, 2025
9bf2482
remove common reconciler
pooknull May 20, 2025
879163f
fix
pooknull May 20, 2025
f87bcc8
fix unit-test
pooknull May 20, 2025
a566737
fix
pooknull May 21, 2025
20e0558
Merge remote-tracking branch 'origin/main' into K8SPSMDB-1211
pooknull May 21, 2025
35a2e22
fix manifests
pooknull May 21, 2025
81186a8
fix tests
pooknull May 21, 2025
2614d85
Merge branch 'main' into K8SPSMDB-1211
hors May 21, 2025
dc8663f
small fix
pooknull May 22, 2025
0a442b9
Merge branch 'main' into K8SPSMDB-1211
pooknull May 22, 2025
b433511
add sleep
pooknull May 23, 2025
81d2898
fix tests
pooknull May 23, 2025
3cd0736
Merge branch 'main' into K8SPSMDB-1211
hors May 23, 2025
f9354c4
wait after adding resync annotation
pooknull May 26, 2025
788505c
backoff wait after adding resync
pooknull May 27, 2025
915ffc8
remove wait and fix tests
pooknull May 27, 2025
9f69da2
Merge remote-tracking branch 'origin/main' into K8SPSMDB-1211
pooknull May 27, 2025
aaa227b
fix merge
pooknull May 27, 2025
82c139f
fix manifests
pooknull May 28, 2025
050f5ef
Merge remote-tracking branch 'origin/main' into K8SPSMDB-1211
pooknull May 28, 2025
ae459c2
fix merge
pooknull May 28, 2025
ced15a2
fix merge
pooknull May 28, 2025
756aefe
fix arbiter
pooknull May 28, 2025
6995236
Merge branch 'main' into K8SPSMDB-1211
pooknull May 28, 2025
fd8c741
Merge remote-tracking branch 'origin/main' into K8SPSMDB-1211
pooknull May 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions e2e-tests/demand-backup-incremental-sharded/run
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,15 @@ run_recovery_check() {

# we don't wait for cluster readiness here because the annotation gets removed then
wait_restore "${backup_name}" "${cluster}" "ready" "0" "3000"
kubectl_bin get psmdb ${cluster} -o yaml
if [ $(kubectl_bin get psmdb ${cluster} -o yaml | yq '.metadata.annotations."percona.com/resync-pbm"') == null ]; then

if [ "$(kubectl_bin get psmdb ${cluster} -o yaml | yq '.metadata.annotations."percona.com/resync-pbm"')" == null ] && [ "$(kubectl_bin get psmdb ${cluster} -o yaml | yq '.metadata.annotations."percona.com/resync-in-progress"')" == null ]; then
log "psmdb/${cluster} should be annotated with percona.com/resync-pbm after a incremental restore"
exit 1
fi
echo

wait_cluster_consistency ${cluster} 60
sleep 5
wait_for_pbm_operations ${cluster}

if [[ $base == true ]]; then
Expand Down
3 changes: 2 additions & 1 deletion e2e-tests/demand-backup-incremental/run
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,14 @@ run_recovery_check() {
# we don't wait for cluster readiness here because the annotation gets removed then
wait_restore "${backup_name}" "${cluster}" "ready" "0" "1800"

if [ $(kubectl_bin get psmdb ${cluster} -o yaml | yq '.metadata.annotations."percona.com/resync-pbm"') == null ]; then
if [ "$(kubectl_bin get psmdb ${cluster} -o yaml | yq '.metadata.annotations."percona.com/resync-pbm"')" == null ] && [ "$(kubectl_bin get psmdb ${cluster} -o yaml | yq '.metadata.annotations."percona.com/resync-in-progress"')" == null ]; then
log "psmdb/${cluster} should be annotated with percona.com/resync-pbm after a incremental restore"
exit 1
fi
echo

wait_cluster_consistency ${cluster}
sleep 5
wait_for_pbm_operations ${cluster}

if [[ $base == true ]]; then
Expand Down
6 changes: 3 additions & 3 deletions pkg/controller/perconaservermongodb/balancer.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@ import (
"context"
"time"

"github.com/percona/percona-server-mongodb-operator/pkg/psmdb"
"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
k8sErrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
logf "sigs.k8s.io/controller-runtime/pkg/log"

api "github.com/percona/percona-server-mongodb-operator/pkg/apis/psmdb/v1"
"github.com/percona/percona-server-mongodb-operator/pkg/psmdb"
)

func (r *ReconcilePerconaServerMongoDB) enableBalancerIfNeeded(ctx context.Context, cr *api.PerconaServerMongoDB) error {
Expand Down Expand Up @@ -85,7 +85,7 @@ func (r *ReconcilePerconaServerMongoDB) enableBalancerIfNeeded(ctx context.Conte
}
}

mongosSession, err := r.mongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
mongosSession, err := r.MongoClient().Mongos(ctx, cr, api.RoleClusterAdmin)
if err != nil {
return errors.Wrap(err, "failed to get mongos connection")
}
Expand Down Expand Up @@ -141,7 +141,7 @@ func (r *ReconcilePerconaServerMongoDB) disableBalancer(ctx context.Context, cr
return errors.Wrapf(err, "get mongos statefulset %s", msSts.Name)
}

mongosSession, err := r.mongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
mongosSession, err := r.MongoClient().Mongos(ctx, cr, api.RoleClusterAdmin)
if err != nil {
return errors.Wrap(err, "failed to get mongos connection")
}
Expand Down
73 changes: 0 additions & 73 deletions pkg/controller/perconaservermongodb/connections.go

This file was deleted.

10 changes: 5 additions & 5 deletions pkg/controller/perconaservermongodb/connections_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ func TestConnectionLeaks(t *testing.T) {
connectionCount := new(int)

r := buildFakeClient(obj...)
r.mongoClientProvider = &fakeMongoClientProvider{pods: rsPods, cr: cr, connectionCount: connectionCount}
r.MongoProviderBase = psmdb.NewProviderBase(r.client, &fakeMongoClientProvider{pods: rsPods, cr: cr, connectionCount: connectionCount})
r.serverVersion = &version.ServerVersion{Platform: version.PlatformKubernetes}
r.crons = NewCronRegistry()

Expand Down Expand Up @@ -395,18 +395,18 @@ func (g *fakeMongoClientProvider) Mongos(ctx context.Context, cr *api.PerconaSer
return &fakeMongoClient{pods: g.pods, cr: g.cr, connectionCount: g.connectionCount, Client: fakeClient}, nil
}

func (g *fakeMongoClientProvider) Standalone(ctx context.Context, cr *api.PerconaServerMongoDB, role api.SystemUserRole, host string, tlsEnabled bool) (mongo.Client, error) {
func (g *fakeMongoClientProvider) Standalone(ctx context.Context, cr *api.PerconaServerMongoDB, rs *api.ReplsetSpec, role api.SystemUserRole, pod corev1.Pod) (mongo.Client, error) {
*g.connectionCount++

fakeClient := mongoFake.NewClient()
return &fakeMongoClient{pods: g.pods, cr: g.cr, connectionCount: g.connectionCount, Client: fakeClient, host: host}, nil
return &fakeMongoClient{pods: g.pods, cr: g.cr, connectionCount: g.connectionCount, Client: fakeClient, pod: &pod}, nil
}

type fakeMongoClient struct {
pods []client.Object
cr *api.PerconaServerMongoDB
connectionCount *int
host string
pod *corev1.Pod
mongo.Client
}

Expand Down Expand Up @@ -522,7 +522,7 @@ func (c *fakeMongoClient) IsMaster(ctx context.Context) (*mongo.IsMasterResp, er
if err := c.cr.CheckNSetDefaults(ctx, version.PlatformKubernetes); err != nil {
return nil, err
}
if c.host == psmdb.GetAddr(c.cr, c.pods[0].GetName(), c.cr.Spec.Replsets[0].Name, c.cr.Spec.Replsets[0].GetPort()) {
if c.pod.GetName() == c.pods[0].GetName() {
isMaster = true
}
return &mongo.IsMasterResp{
Expand Down
10 changes: 6 additions & 4 deletions pkg/controller/perconaservermongodb/custom_users.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ func (r *ReconcilePerconaServerMongoDB) reconcileCustomUsers(ctx context.Context
var err error
var mongoCli mongo.Client
if cr.Spec.Sharding.Enabled {
mongoCli, err = r.mongosClientWithRole(ctx, cr, api.RoleUserAdmin)
mongoCli, err = r.MongoClient().Mongos(ctx, cr, api.RoleUserAdmin)
} else {
mongoCli, err = r.mongoClientWithRole(ctx, cr, cr.Spec.Replsets[0], api.RoleUserAdmin)
mongoCli, err = r.MongoClient().Mongo(ctx, cr, cr.Spec.Replsets[0], api.RoleUserAdmin)
}
if err != nil {
return errors.Wrap(err, "failed to get mongo client")
Expand Down Expand Up @@ -310,7 +310,8 @@ func updatePass(
user *api.User,
userInfo *mongo.User,
secret *corev1.Secret,
annotationKey, passKey string) error {
annotationKey, passKey string,
) error {
log := logf.FromContext(ctx)

if userInfo == nil || user.IsExternalDB() {
Expand Down Expand Up @@ -395,7 +396,8 @@ func createUser(
mongoCli mongo.Client,
user *api.User,
secret *corev1.Secret,
annotationKey, passKey string) error {
annotationKey, passKey string,
) error {
log := logf.FromContext(ctx)

roles := make([]mongo.Role, 0)
Expand Down
6 changes: 3 additions & 3 deletions pkg/controller/perconaservermongodb/fcv.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (
)

func (r *ReconcilePerconaServerMongoDB) getFCV(ctx context.Context, cr *api.PerconaServerMongoDB) (string, error) {
c, err := r.mongoClientWithRole(ctx, cr, cr.Spec.Replsets[0], api.RoleClusterAdmin)
c, err := r.MongoClient().Mongo(ctx, cr, cr.Spec.Replsets[0], api.RoleClusterAdmin)
if err != nil {
return "", errors.Wrap(err, "failed to get connection")
}
Expand Down Expand Up @@ -40,9 +40,9 @@ func (r *ReconcilePerconaServerMongoDB) setFCV(ctx context.Context, cr *api.Perc
var connErr error

if cr.Spec.Sharding.Enabled {
cli, connErr = r.mongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
cli, connErr = r.MongoClient().Mongos(ctx, cr, api.RoleClusterAdmin)
} else {
cli, connErr = r.mongoClientWithRole(ctx, cr, cr.Spec.Replsets[0], api.RoleClusterAdmin)
cli, connErr = r.MongoClient().Mongo(ctx, cr, cr.Spec.Replsets[0], api.RoleClusterAdmin)
}

if connErr != nil {
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/perconaservermongodb/finalizers.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func (r *ReconcilePerconaServerMongoDB) checkFinalizers(ctx context.Context, cr
}

func (r *ReconcilePerconaServerMongoDB) deleteAllPITRChunks(ctx context.Context, cr *api.PerconaServerMongoDB) error {
pbmc, err := r.newPBM(ctx, r.client, cr)
pbmc, err := r.newPBMFunc(ctx, r.client, cr)
if err != nil {
return errors.Wrap(err, "new pbm")
}
Expand Down
16 changes: 8 additions & 8 deletions pkg/controller/perconaservermongodb/mgo.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ func (r *ReconcilePerconaServerMongoDB) reconcileCluster(ctx context.Context, cr
}
}

cli, err := r.mongoClientWithRole(ctx, cr, replset, api.RoleClusterAdmin)
cli, err := r.MongoClient().Mongo(ctx, cr, replset, api.RoleClusterAdmin)
if err != nil {
if cr.Spec.Unmanaged {
return api.AppStateInit, nil, nil
Expand Down Expand Up @@ -185,7 +185,7 @@ func (r *ReconcilePerconaServerMongoDB) reconcileCluster(ctx context.Context, cr
replset.ClusterRole == api.ClusterRoleShardSvr &&
len(mongosPods) > 0 && cr.Spec.Sharding.Mongos.Size > 0 {

mongosSession, err := r.mongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
mongosSession, err := r.MongoClient().Mongos(ctx, cr, api.RoleClusterAdmin)
if err != nil {
return api.AppStateError, nil, errors.Wrap(err, "failed to get mongos connection")
}
Expand Down Expand Up @@ -569,7 +569,7 @@ func (r *ReconcilePerconaServerMongoDB) removeRSFromShard(ctx context.Context, c
return nil
}

cli, err := r.mongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
cli, err := r.MongoClient().Mongos(ctx, cr, api.RoleClusterAdmin)
if err != nil {
return errors.Errorf("failed to get mongos connection: %v", err)
}
Expand Down Expand Up @@ -619,7 +619,7 @@ func (r *ReconcilePerconaServerMongoDB) handleRsAddToShard(ctx context.Context,
return errors.Wrapf(err, "get rsPod %s host", rspod.Name)
}

cli, err := r.mongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
cli, err := r.MongoClient().Mongos(ctx, cr, api.RoleClusterAdmin)
if err != nil {
return errors.Wrap(err, "failed to get mongos client")
}
Expand Down Expand Up @@ -722,7 +722,7 @@ func (r *ReconcilePerconaServerMongoDB) handleReplsetInit(ctx context.Context, c
time.Sleep(time.Second * 5)

log.Info("creating user admin", "replset", replsetName, "pod", pod.Name, "user", api.RoleUserAdmin)
userAdmin, err := getInternalCredentials(ctx, r.client, cr, api.RoleUserAdmin)
userAdmin, err := psmdb.GetCredentials(ctx, r.client, cr, api.RoleUserAdmin)
if err != nil {
return nil, nil, errors.Wrap(err, "failed to get userAdmin credentials")
}
Expand Down Expand Up @@ -755,7 +755,7 @@ func (r *ReconcilePerconaServerMongoDB) handleReplicaSetNoPrimary(ctx context.Co
}

log.Info("Connecting to pod", "pod", pod.Name, "user", api.RoleClusterAdmin)
cli, err := r.standaloneClientWithRole(ctx, cr, replset, api.RoleClusterAdmin, pod)
cli, err := r.MongoClient().Standalone(ctx, cr, replset, api.RoleClusterAdmin, pod)
if err != nil {
return errors.Wrap(err, "get standalone mongo client")
}
Expand Down Expand Up @@ -920,7 +920,7 @@ func compareRoles(x []mongo.Role, y []mongo.Role) bool {
func (r *ReconcilePerconaServerMongoDB) createOrUpdateSystemUsers(ctx context.Context, cr *api.PerconaServerMongoDB, replset *api.ReplsetSpec) error {
log := logf.FromContext(ctx)

cli, err := r.mongoClientWithRole(ctx, cr, replset, api.RoleUserAdmin)
cli, err := r.MongoClient().Mongo(ctx, cr, replset, api.RoleUserAdmin)
if err != nil {
return errors.Wrap(err, "failed to get mongo client")
}
Expand Down Expand Up @@ -1011,7 +1011,7 @@ func (r *ReconcilePerconaServerMongoDB) createOrUpdateSystemUsers(ctx context.Co
}

for _, role := range users {
creds, err := getInternalCredentials(ctx, r.client, cr, role)
creds, err := psmdb.GetCredentials(ctx, r.client, cr, role)
if err != nil {
log.Error(err, "failed to get credentials", "role", role)
continue
Expand Down
21 changes: 11 additions & 10 deletions pkg/controller/perconaservermongodb/psmdb_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,14 @@ func newReconciler(mgr manager.Manager) (reconcile.Reconciler, error) {
}

return &ReconcilePerconaServerMongoDB{
MongoProviderBase: psmdb.NewProviderBase(client, nil),
client: client,
scheme: mgr.GetScheme(),
scheme: client.Scheme(),
newPBMFunc: backup.NewPBM,
serverVersion: sv,
reconcileIn: time.Second * 5,
crons: NewCronRegistry(),
lockers: newLockStore(),
newPBM: backup.NewPBM,
restConfig: mgr.GetConfig(),
newCertManagerCtrlFunc: tls.NewCertManagerController,

Expand Down Expand Up @@ -173,21 +174,21 @@ func NewCronRegistry() CronRegistry {

// ReconcilePerconaServerMongoDB reconciles a PerconaServerMongoDB object
type ReconcilePerconaServerMongoDB struct {
psmdb.MongoProviderBase
// This client, initialized using mgr.Client() above, is a split client
// that reads objects from the cache and writes to the apiserver
client client.Client
scheme *runtime.Scheme
restConfig *rest.Config

crons CronRegistry
clientcmd *clientcmd.Client
serverVersion *version.ServerVersion
reconcileIn time.Duration
mongoClientProvider MongoClientProvider
crons CronRegistry
clientcmd *clientcmd.Client
serverVersion *version.ServerVersion
reconcileIn time.Duration

newCertManagerCtrlFunc tls.NewCertManagerControllerFunc
newPBMFunc backup.NewPBMFunc

newPBM backup.NewPBMFunc
newCertManagerCtrlFunc tls.NewCertManagerControllerFunc

initImage string

Expand Down Expand Up @@ -866,7 +867,7 @@ func (r *ReconcilePerconaServerMongoDB) checkIfUserDataExistInRS(ctx context.Con
return errors.Wrap(err, "failed to set port")
}

mc, err := r.mongoClientWithRole(ctx, cr, rs, api.RoleClusterAdmin)
mc, err := r.MongoClient().Mongo(ctx, cr, rs, api.RoleClusterAdmin)
if err != nil {
return errors.Wrap(err, "dial:")
}
Expand Down
Loading
Loading