Skip to content

Commit 27e706a

Browse files
committed
Bug#29669099 THERE IS CHANCE THAT NODE WILL NOT GET RECOVERED POST CLONE FAIL
Bug#29944828 POST CLONE FAILURE DISTRIBUTED RECOVERY DOESN'T TRY IN CASE OF PLUGIN DIFFERENCE When group replication is recovering and there is a clone issue, it only makes sense to fall back to distributed recovery in some cases. With this patch, the recovery process falls back only when the clone plugin has not yet removed the local server data. ReviewBoard: 23789
1 parent e3eae96 commit 27e706a

12 files changed

+517
-9
lines changed

include/mysql/group_replication_priv.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,4 +210,9 @@ unsigned long get_max_slave_max_allowed_packet();
210210
*/
211211
bool is_server_restarting_after_clone();
212212

213+
/**
214+
@returns whether the server has already dropped its data when cloning
215+
*/
216+
bool is_server_data_dropped();
217+
213218
#endif /* GROUP_REPLICATION_PRIV_INCLUDE */
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
include/group_replication.inc
2+
Warnings:
3+
Note #### Sending passwords in plain text without SSL/TLS is extremely insecure.
4+
Note #### Storing MySQL user name or password information in the master info repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START SLAVE; see the 'START SLAVE Syntax' in the MySQL Manual for more information.
5+
[connection server1]
6+
7+
############################################################
8+
# 1. Install clone plugin on server1.
9+
[connection server1]
10+
INSTALL PLUGIN clone SONAME 'CLONE_PLUGIN';
11+
SET GLOBAL clone_ddl_timeout = 0;
12+
13+
############################################################
14+
# 2. Bootstrap server1 and add some data
15+
include/start_and_bootstrap_group_replication.inc
16+
CREATE TABLE t1 (c1 INT NOT NULL PRIMARY KEY) ENGINE=InnoDB;
17+
INSERT INTO t1 VALUES (1);
18+
INSERT INTO t1 VALUES (2);
19+
20+
#######################################################################
21+
# 3. Restart server 2 with a monitoring process (mysqld_safe) if needed
22+
[connection server2]
23+
include/spawn_monitoring_process.inc
24+
25+
############################################################
26+
# 4. Setup the server so group replication starts on boot
27+
# Install the Clone plugin
28+
INSTALL PLUGIN clone SONAME 'CLONE_PLUGIN';
29+
30+
############################################################
31+
# 5. On a empty server2 start group replication
32+
# Pause it and execute a DDL query on the donor
33+
# The clone process should fail on the joiner
34+
SET GLOBAL group_replication_clone_threshold= 1;
35+
SET GLOBAL clone_autotune_concurrency = OFF;
36+
SET GLOBAL clone_max_concurrency = 1;
37+
SET GLOBAL clone_ddl_timeout = 0;
38+
SET @@GLOBAL.DEBUG='+d,gr_clone_wait';
39+
START GROUP_REPLICATION;
40+
SET DEBUG_SYNC = 'now WAIT_FOR gr_clone_paused';
41+
SET @@GLOBAL.DEBUG='-d,gr_clone_wait';
42+
[connection server_1_1]
43+
ALTER TABLE t1 ADD COLUMN col2 int, ALGORITHM=COPY;
44+
[connection server1]
45+
[connection server2]
46+
SET DEBUG_SYNC = 'now SIGNAL gr_clone_continue';
47+
include/gr_wait_for_member_state.inc
48+
include/assert.inc [Clone should have failed]
49+
include/assert_grep.inc [Clone failed and no recovery is possible]
50+
51+
############################################################
52+
# 6. Restart Group replication
53+
# The clone process should now be successful
54+
include/stop_group_replication.inc
55+
START GROUP_REPLICATION;
56+
include/rpl_reconnect.inc
57+
include/gr_wait_for_member_state.inc
58+
include/diff_tables.inc [server1:test.t1 ,server2:test.t1]
59+
60+
############################################################
61+
# 7. Cleanup
62+
SET GLOBAL group_replication_clone_threshold= CLONE_THRESHOLD_VALUE;
63+
RESET PERSIST IF EXISTS group_replication_group_name;
64+
RESET PERSIST IF EXISTS group_replication_local_address;
65+
RESET PERSIST IF EXISTS group_replication_group_seeds;
66+
RESET PERSIST IF EXISTS group_replication_start_on_boot;
67+
SET GLOBAL group_replication_start_on_boot= START_ON_BOOT_VALUE;
68+
[connection server_1_1]
69+
DROP TABLE t1;
70+
include/rpl_sync.inc
71+
[connection server2]
72+
include/clean_monitoring_process.inc
73+
set session sql_log_bin=0;
74+
call mtr.add_suppression("Clone removing all user data for provisioning: Started");
75+
call mtr.add_suppression("Clone removing all user data for provisioning: Finished");
76+
call mtr.add_suppression("Due to the number of missing transactions being higher than the configured threshold of 1, this member will start distributed recovery using clone.");
77+
call mtr.add_suppression("Internal query: CLONE INSTANCE FROM \'root\'@\'127.0.0.1\':[0-9]+ IDENTIFIED BY \'\\*\\*\\*\\*\\*\' REQUIRE NO SSL; result in error. Error number:*");
78+
call mtr.add_suppression("There was an issue when cloning from another server: Error number: 3862 Error message: Clone Donor Error: 3633 : Concurrent DDL is performed during clone operation. Please try again.");
79+
call mtr.add_suppression("Due to a critical cloning error or lack of donors, distributed recovery cannot be executed. The member will now leave the group.");
80+
call mtr.add_suppression("Skipping leave operation: concurrent attempt to leave the group is on-going.");
81+
set session sql_log_bin=1;
82+
[connection server1]
83+
UNINSTALL PLUGIN clone;
84+
[connection server2]
85+
UNINSTALL PLUGIN clone;
86+
include/group_replication_end.inc
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
include/group_replication.inc
2+
Warnings:
3+
Note #### Sending passwords in plain text without SSL/TLS is extremely insecure.
4+
Note #### Storing MySQL user name or password information in the master info repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START SLAVE; see the 'START SLAVE Syntax' in the MySQL Manual for more information.
5+
[connection server1]
6+
7+
############################################################
8+
# 1. Install clone and keyring plugins on server1.
9+
[connection server1]
10+
INSTALL PLUGIN clone SONAME 'CLONE_PLUGIN';
11+
INSTALL PLUGIN keyring_file SONAME 'KEYRING_PLUGIN';
12+
13+
############################################################
14+
# 2. Bootstrap server1 and add some data
15+
include/start_and_bootstrap_group_replication.inc
16+
CREATE TABLE t1 (c1 INT NOT NULL PRIMARY KEY) ENGINE=InnoDB;
17+
INSERT INTO t1 VALUES (1);
18+
INSERT INTO t1 VALUES (2);
19+
20+
#######################################################################
21+
# 3. Restart server 2 with a monitoring process (mysqld_safe) if needed
22+
[connection server2]
23+
include/spawn_monitoring_process.inc
24+
25+
############################################################
26+
# 4. Install the clone plugin on server2.
27+
INSTALL PLUGIN clone SONAME 'CLONE_PLUGIN';
28+
29+
############################################################
30+
# 5. On a empty server2 start group replication
31+
# Clone will fail since the donor has keyring plugin and
32+
# the joiner does not. Recovery will fallback to incremental.
33+
SET @group_replication_clone_threshold_saved = @@GLOBAL.group_replication_clone_threshold;
34+
SET GLOBAL group_replication_clone_threshold= 1;
35+
include/start_group_replication.inc
36+
include/diff_tables.inc [server1:test.t1 ,server2:test.t1]
37+
include/assert.inc [Clone should have failed]
38+
include/assert_grep.inc [Clone failed]
39+
40+
############################################################
41+
# 6. Cleanup
42+
SET GLOBAL group_replication_clone_threshold= @group_replication_clone_threshold_saved;
43+
DROP TABLE t1;
44+
include/rpl_sync.inc
45+
[connection server2]
46+
include/clean_monitoring_process.inc
47+
set session sql_log_bin=0;
48+
call mtr.add_suppression("Due to the number of missing transactions being higher than the configured threshold of 1, this member will start distributed recovery using clone.");
49+
call mtr.add_suppression("Internal query: CLONE INSTANCE FROM \'root\'@\'127.0.0.1\':[0-9]+ IDENTIFIED BY \'\\*\\*\\*\\*\\*\' REQUIRE NO SSL; result in error. Error number: 3870");
50+
call mtr.add_suppression("There was an issue when cloning from another server: Error number: 3870 Error message: Clone Donor plugin keyring_file is not active in Recipient.");
51+
call mtr.add_suppression("Due to some issue on the previous step distributed recovery is now executing: Incremental Recovery.");
52+
set session sql_log_bin=1;
53+
[connection server1]
54+
UNINSTALL PLUGIN clone;
55+
UNINSTALL PLUGIN keyring_file;
56+
[connection server2]
57+
UNINSTALL PLUGIN clone;
58+
include/group_replication_end.inc
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
$CLONE_PLUGIN_OPT
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
$CLONE_PLUGIN_OPT
Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
###############################################################################
2+
# This test evaluates that on critical scenarios like the execution of a
3+
# concurrent DDL query on the donor, the joiner cloning process will fail.
4+
# It also evaluates that, for errors like this one that happen after all the
5+
# data was already removed from the joiner, no distributed recovery attempts occur.
6+
#
7+
# 1. Install clone plugin on server1.
8+
# 2. Bootstrap server1 and add some data
9+
# 3. Restart server 2 with a monitoring process (mysqld_safe) if needed
10+
# 4. Setup the server so group replication starts on boot
11+
# Install the Clone plugin
12+
# 5. On an empty server2 start group replication
13+
# Pause it and execute a DDL query on the donor
14+
# The clone process should fail on the joiner
15+
# 6. Restart Group replication
16+
# The clone process should now be successful
17+
# 7. Cleanup
18+
#
19+
20+
--source include/have_debug.inc
21+
--source include/big_test.inc
22+
--source include/have_mysqld_monitoring_process.inc
23+
--source include/have_clone_plugin.inc
24+
--source include/have_group_replication_plugin.inc
25+
--let $rpl_skip_group_replication_start= 1
26+
--source include/group_replication.inc
27+
28+
--echo
29+
--echo ############################################################
30+
--echo # 1. Install clone plugin on server1.
31+
32+
--let $rpl_connection_name= server1
33+
--source include/rpl_connection.inc
34+
35+
--replace_result $CLONE_PLUGIN CLONE_PLUGIN
36+
--eval INSTALL PLUGIN clone SONAME '$CLONE_PLUGIN'
37+
SET GLOBAL clone_ddl_timeout = 0;
38+
39+
--echo
40+
--echo ############################################################
41+
--echo # 2. Bootstrap server1 and add some data
42+
43+
--source include/start_and_bootstrap_group_replication.inc
44+
45+
CREATE TABLE t1 (c1 INT NOT NULL PRIMARY KEY) ENGINE=InnoDB;
46+
INSERT INTO t1 VALUES (1);
47+
INSERT INTO t1 VALUES (2);
48+
49+
--echo
50+
--echo #######################################################################
51+
--echo # 3. Restart server 2 with a monitoring process (mysqld_safe) if needed
52+
53+
--let $rpl_connection_name= server2
54+
--source include/rpl_connection.inc
55+
56+
--let $_group_replication_local_address= `SELECT @@GLOBAL.group_replication_local_address`
57+
--let $_group_replication_group_seeds= `SELECT @@GLOBAL.group_replication_group_seeds`
58+
59+
--let $plugin_list= $GROUP_REPLICATION
60+
--source include/spawn_monitoring_process.inc
61+
62+
--echo
63+
--echo ############################################################
64+
--echo # 4. Setup the server so group replication starts on boot
65+
--echo # Install the Clone plugin
66+
67+
--disable_query_log
68+
--eval SET PERSIST group_replication_group_name= "$group_replication_group_name"
69+
--eval SET PERSIST group_replication_local_address= "$_group_replication_local_address"
70+
--eval SET PERSIST group_replication_group_seeds= "$_group_replication_group_seeds"
71+
72+
--let $_group_replication_start_on_boot_save= `SELECT @@GLOBAL.group_replication_start_on_boot`
73+
SET PERSIST group_replication_start_on_boot= ON;
74+
--enable_query_log
75+
76+
--replace_result $CLONE_PLUGIN CLONE_PLUGIN
77+
--eval INSTALL PLUGIN clone SONAME '$CLONE_PLUGIN'
78+
79+
--echo
80+
--echo ############################################################
81+
--echo # 5. On a empty server2 start group replication
82+
--echo # Pause it and execute a DDL query on the donor
83+
--echo # The clone process should fail on the joiner
84+
85+
--let $_group_replication_threshold_save= `SELECT @@GLOBAL.group_replication_clone_threshold`
86+
87+
SET GLOBAL group_replication_clone_threshold= 1;
88+
SET GLOBAL clone_autotune_concurrency = OFF;
89+
SET GLOBAL clone_max_concurrency = 1;
90+
SET GLOBAL clone_ddl_timeout = 0;
91+
92+
SET @@GLOBAL.DEBUG='+d,gr_clone_wait';
93+
94+
START GROUP_REPLICATION;
95+
96+
SET DEBUG_SYNC = 'now WAIT_FOR gr_clone_paused';
97+
SET @@GLOBAL.DEBUG='-d,gr_clone_wait';
98+
99+
--let $rpl_connection_name= server_1_1
100+
--source include/rpl_connection.inc
101+
102+
--send ALTER TABLE t1 ADD COLUMN col2 int, ALGORITHM=COPY
103+
104+
--let $rpl_connection_name= server1
105+
--source include/rpl_connection.inc
106+
107+
# Wait until the ALTER TABLE is waiting for clone to bail out.
108+
--let $wait_condition= SELECT PROCESSLIST_STATE="After create" FROM performance_schema.threads WHERE PROCESSLIST_INFO LIKE "ALTER TABLE t1 ADD COLUMN col2 int, ALGORITHM=COPY"
109+
--source include/wait_condition.inc
110+
111+
--let $rpl_connection_name= server2
112+
--source include/rpl_connection.inc
113+
114+
SET DEBUG_SYNC = 'now SIGNAL gr_clone_continue';
115+
116+
--let $group_replication_member_state=ERROR
117+
--source include/gr_wait_for_member_state.inc
118+
119+
--let $assert_text= Clone should have failed
120+
--let $assert_cond= [SELECT state="Failed" FROM performance_schema.clone_status] = 1;
121+
--source include/assert.inc
122+
123+
# Verify in the log file that recovery cannot be executed
124+
--let $assert_file= $MYSQLTEST_VARDIR/log/mysqld.2.err
125+
--let $assert_only_after = CURRENT_TEST: group_replication.gr_clone_integration_ddl_error
126+
--let $assert_text = Clone failed and no recovery is possible
127+
--let $assert_select = Due to a critical cloning error or lack of donors, distributed recovery cannot be executed.
128+
--let $assert_count = 1
129+
--source include/assert_grep.inc
130+
131+
--echo
132+
--echo ############################################################
133+
--echo # 6. Restart Group replication
134+
--echo # The clone process should now be successful
135+
136+
--source include/stop_group_replication.inc
137+
138+
START GROUP_REPLICATION;
139+
140+
--source include/wait_until_disconnected.inc
141+
142+
--let $rpl_server_number= 2
143+
--source include/rpl_reconnect.inc
144+
145+
--let $group_replication_member_state=ONLINE
146+
--source include/gr_wait_for_member_state.inc
147+
148+
# See if the data has been properly cloned in server2
149+
--let $diff_tables=server1:test.t1 ,server2:test.t1
150+
--source include/diff_tables.inc
151+
152+
--echo
153+
--echo ############################################################
154+
--echo # 7. Cleanup
155+
156+
--replace_result $_group_replication_threshold_save CLONE_THRESHOLD_VALUE
157+
--eval SET GLOBAL group_replication_clone_threshold= $_group_replication_threshold_save
158+
159+
RESET PERSIST IF EXISTS group_replication_group_name;
160+
RESET PERSIST IF EXISTS group_replication_local_address;
161+
RESET PERSIST IF EXISTS group_replication_group_seeds;
162+
RESET PERSIST IF EXISTS group_replication_start_on_boot;
163+
--replace_result $_group_replication_start_on_boot_save START_ON_BOOT_VALUE
164+
--eval SET GLOBAL group_replication_start_on_boot= $_group_replication_start_on_boot_save
165+
166+
--let $rpl_connection_name= server_1_1
167+
--source include/rpl_connection.inc
168+
169+
--reap
170+
171+
DROP TABLE t1;
172+
173+
--source include/rpl_sync.inc
174+
175+
--let $rpl_connection_name= server2
176+
--source include/rpl_connection.inc
177+
178+
--source include/clean_monitoring_process.inc
179+
180+
set session sql_log_bin=0;
181+
call mtr.add_suppression("Clone removing all user data for provisioning: Started");
182+
call mtr.add_suppression("Clone removing all user data for provisioning: Finished");
183+
call mtr.add_suppression("Due to the number of missing transactions being higher than the configured threshold of 1, this member will start distributed recovery using clone.");
184+
call mtr.add_suppression("Internal query: CLONE INSTANCE FROM \'root\'@\'127.0.0.1\':[0-9]+ IDENTIFIED BY \'\\*\\*\\*\\*\\*\' REQUIRE NO SSL; result in error. Error number:*");
185+
call mtr.add_suppression("There was an issue when cloning from another server: Error number: 3862 Error message: Clone Donor Error: 3633 : Concurrent DDL is performed during clone operation. Please try again.");
186+
call mtr.add_suppression("Due to a critical cloning error or lack of donors, distributed recovery cannot be executed. The member will now leave the group.");
187+
call mtr.add_suppression("Skipping leave operation: concurrent attempt to leave the group is on-going.");
188+
set session sql_log_bin=1;
189+
190+
--let $rpl_connection_name= server1
191+
--source include/rpl_connection.inc
192+
193+
UNINSTALL PLUGIN clone;
194+
195+
--let $rpl_connection_name= server2
196+
--source include/rpl_connection.inc
197+
198+
UNINSTALL PLUGIN clone;
199+
200+
--source include/group_replication_end.inc
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
$CLONE_PLUGIN_OPT
2+
$KEYRING_PLUGIN_OPT --loose-keyring-file-data=$MYSQL_TMP_DIR/gr_clone_integration_different_plugins_keyring
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
$CLONE_PLUGIN_OPT

0 commit comments

Comments
 (0)