Skip to content

Commit 6e6add6

Browse files
author
Venkatesh Duggirala
committed
Bug #18789758 DATA INCONSISTENCIES WHEN MASTER HAS TRUNCATED
BINARY LOG WITH GTID AFTER CRASH Problem: Master's dump thread is not detecting the case where Slave's gtid executed set has more gtids than Master's gtid executed set with respect to Master's UUID. Analysis & Fix: In normal scenarios, it is not possible for the Slave to contain more gtids than the Master with respect to Master's UUID. But it is possible if Master's binary log is truncated (due to RAID failure) or Master's binary log is deleted but GTID_PURGED was not set properly. That scenario needs to be validated, i.e., it should *always* be the case that Slave's gtid executed set (+ retrieved set) is a subset of Master's gtid executed set with respect to Master's UUID. If that is not the case, Master's dump thread will be stopped and the situation will be reported to the Slave during the handshake (thus the Slave's I/O thread will also be stopped, with the error ER_MASTER_FATAL_ERROR_READING_BINLOG). Otherwise, it can lead to data inconsistency between Master and Slave.
1 parent 483dccb commit 6e6add6

12 files changed

Lines changed: 197 additions & 13 deletions

mysql-test/suite/rpl/r/rpl_check_gtid.result

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ FLUSH LOGS;
229229
include/stop_slave.inc
230230
RESET SLAVE;
231231
RESET MASTER;
232+
RESET MASTER;
232233
include/start_slave.inc
233234
BEGIN;
234235
INSERT INTO t1 VALUES(1);
@@ -246,6 +247,7 @@ FLUSH LOGS;
246247
include/stop_slave.inc
247248
RESET SLAVE;
248249
RESET MASTER;
250+
RESET MASTER;
249251
include/start_slave.inc
250252
BEGIN;
251253
INSERT INTO t1 VALUES(1);
@@ -263,6 +265,7 @@ FLUSH LOGS;
263265
include/stop_slave.inc
264266
RESET SLAVE;
265267
RESET MASTER;
268+
RESET MASTER;
266269
include/start_slave.inc
267270
BEGIN;
268271
INSERT INTO t1 VALUES(1);
@@ -280,6 +283,7 @@ FLUSH LOGS;
280283
include/stop_slave.inc
281284
RESET SLAVE;
282285
RESET MASTER;
286+
RESET MASTER;
283287
include/start_slave.inc
284288
BEGIN;
285289
INSERT INTO t1 VALUES(1);

mysql-test/suite/rpl/r/rpl_gtid_validate_slave_gtids.result

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,26 @@ Warnings:
33
Note #### Sending passwords in plain text without SSL/TLS is extremely insecure.
44
Note #### Storing MySQL user name or password information in the master info repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START SLAVE; see the 'START SLAVE Syntax' in the MySQL Manual for more information.
55
[connection master]
6-
SET GLOBAL GTID_PURGED ="master_uuid:1-2";
6+
SET GLOBAL GTID_PURGED= "master_uuid:1-2";
77
CREATE TABLE t1(i INT);
88
DROP TABLE t1;
99
START SLAVE IO_THREAD;
1010
include/wait_for_slave_io_error.inc [errno=1236]
1111
Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'The slave is connecting using CHANGE MASTER TO MASTER_AUTO_POSITION = 1, but the master has purged binary logs containing GTIDs that the slave requires.''
12-
SET GLOBAL GTID_PURGED ="master_uuid:1-2";
12+
SET GLOBAL GTID_PURGED= "master_uuid:1-2";
1313
include/start_slave.inc
1414
include/sync_slave_sql_with_master.inc
1515
include/assert.inc [Slave should be able to get GTID-3 and 4 now.]
1616
call mtr.add_suppression(".*Master has purged binary logs containing GTIDs that the slave requires.*");
17+
include/rpl_reset.inc
18+
CREATE TABLE t1(i INT);
19+
DROP TABLE t1;
20+
include/stop_slave.inc
21+
include/rpl_restart_server.inc [server_number=1]
22+
START SLAVE IO_THREAD;
23+
include/wait_for_slave_io_error.inc [errno=1236]
24+
Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'Slave has more GTIDs than the master has, using the master's SERVER_UUID. This may indicate that the end of the binary log was truncated or that the last binary log file was lost, e.g., after a power or disk failure when sync_binlog != 1. The master may or may not have rolled back transactions that were already replica''
25+
call mtr.add_suppression(".*Slave has more GTIDs than the master has.*");
26+
SET GLOBAL GTID_PURGED= "master_uuid:1-2";
27+
include/start_slave.inc
1728
include/rpl_end.inc

mysql-test/suite/rpl/r/rpl_kill_query.result

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ include/assert.inc [Nothing should be inserted on slave]
4545
---- Clean up ----
4646
include/stop_slave_io.inc
4747
RESET SLAVE;
48+
RESET MASTER;
4849
DROP DATABASE db1;
4950
[connection master]
5051
RESET MASTER;

mysql-test/suite/rpl/r/rpl_packet.result

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ Last_IO_Error = 'Got a packet bigger than 'slave_max_allowed_packet' bytes'
5050
STOP SLAVE;
5151
RESET SLAVE;
5252
RESET MASTER;
53+
RESET MASTER;
5354
SET @max_allowed_packet_0= @@session.max_allowed_packet;
5455
SHOW BINLOG EVENTS;
5556
SET @max_allowed_packet_1= @@session.max_allowed_packet;

mysql-test/suite/rpl/t/rpl_check_gtid.test

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,7 @@ while ($i != 5)
504504
--connection slave
505505
--source include/stop_slave.inc
506506
RESET SLAVE;
507+
RESET MASTER;
507508
--connection master
508509
RESET MASTER;
509510
--connection slave
@@ -525,14 +526,13 @@ while ($i != 5)
525526

526527
# 5.3 - Makes the slave to retrieve and apply these transactions.
527528
--source include/sync_slave_sql_with_master.inc
528-
529529
# Check property 1.2, 2.1, 3.1 and 3.2
530530
--let $binlog= binlog
531531
--let $server_uuid= $master_uuid
532-
--let $gtid_set_ini= 1
533-
--let $gtid_set_end= 4
534-
--let $gtid_ini=
535-
--let $gtid_end=
532+
--let $gtid_set_ini=
533+
--let $gtid_set_end=
534+
--let $gtid_ini= 1
535+
--let $gtid_end= 2
536536
--source extra/rpl_tests/rpl_check_gtid.inc
537537

538538
# 5.4 - Rotates both the binary logs on the slave and master.

mysql-test/suite/rpl/t/rpl_gtid_validate_slave_gtids.test

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
# Step-1) Master wants to set 1 and 2 GTIDs in GTID_PURGED
2323
--let $master_uuid= `SELECT @@GLOBAL.SERVER_UUID`
2424
--replace_result $master_uuid master_uuid
25-
--eval SET GLOBAL GTID_PURGED ="$master_uuid:1-2"
25+
--eval SET GLOBAL GTID_PURGED= "$master_uuid:1-2"
2626

2727
# Step-2) Do some dummy transactions
2828
CREATE TABLE t1(i INT); # GTID-3
@@ -42,7 +42,7 @@ START SLAVE IO_THREAD;
4242

4343
# Fix the problem by setting GTID_PURGED on Slave as well.
4444
--replace_result $master_uuid master_uuid
45-
--eval SET GLOBAL GTID_PURGED ="$master_uuid:1-2"
45+
--eval SET GLOBAL GTID_PURGED= "$master_uuid:1-2"
4646

4747
# Now start slave threads. Slave will ask Master to send all GTIDs
4848
# except GTID:1-2 which should not be a problem and should not throw
@@ -56,4 +56,73 @@ START SLAVE IO_THREAD;
5656
--source include/assert.inc
5757

5858
call mtr.add_suppression(".*Master has purged binary logs containing GTIDs that the slave requires.*");
59+
60+
###############################################################################
61+
#Bug #18789758 DATA INCONSISTENCIES WHEN MASTER HAS TRUNCATED BINARY LOG WITH
62+
# GTID AFTER CRASH
63+
# Problem:
64+
# Master's dump thread is not detecting the case where Slave's gtid executed
65+
# set is having more gtids than Master's gtid executed set with respect
66+
# to Master's UUID.
67+
# Fix:
68+
# If it happens, dump thread will be stopped during the handshake
69+
# with Slave(thus the Slave's I/O thread will be stopped with the
70+
# error). Otherwise, it can lead to data inconsistency
71+
# between Master and Slave.
72+
#
73+
# Steps to reproduce:
74+
# 1) Execute some sample gtid transactions on Master
75+
# 2) Let it reach Slave
76+
# 3) Fake raid failure( by manually deleting binary log file)
77+
# 4) Restart Master, thus removing those gtids from executed gtid set.
78+
# 5) Restart slave's I/O thread, Slave sends its gtids which are purged on
79+
# Master
80+
# 6) Make sure I/O thread gets error and informs the situation to the Slave.
81+
# 7) Verify that the situation recovers back to normal after setting GTID_PURGED
82+
# value to those purged gtids.
83+
###############################################################################
84+
85+
# Clean up the replication state left by the previous test case
86+
--connection master
87+
--source include/rpl_reset.inc
88+
89+
# Step-1) Execute some sample gtid transactions on Master
90+
CREATE TABLE t1(i INT);
91+
DROP TABLE t1;
92+
93+
# Step-2) Let it reach Slave.
94+
--sync_slave_with_master
95+
--source include/stop_slave.inc
96+
97+
# Step-3) Fake raid failure( by manually deleting binary log file)
98+
--connection master
99+
--let $master_datadir= `SELECT @@datadir;`
100+
--remove_file $master_datadir/master-bin.000001
101+
--remove_file $master_datadir/master-bin.index
102+
103+
# Step-4) Restart Master, thus removing those gtids from executed gtid set.
104+
--let $rpl_server_number= 1
105+
--source include/rpl_restart_server.inc
106+
107+
# Step-5) Restart slave's I/O thread, Slave sends its gtids which are purged on Master
108+
--connection slave
109+
START SLAVE IO_THREAD;
110+
111+
# Step-6) Make sure I/O thread gets error and informs the situation to the Slave.
112+
--let $slave_io_errno= convert_error(ER_MASTER_FATAL_ERROR_READING_BINLOG)
113+
--let $show_slave_io_error= 1
114+
--source include/wait_for_slave_io_error.inc
115+
call mtr.add_suppression(".*Slave has more GTIDs than the master has.*");
116+
117+
# Step-7) Verify that the situation recovers back to normal after setting GTID_PURGED
118+
# value to those purged gtids.
119+
--connection master
120+
--let $master_uuid= `SELECT @@GLOBAL.SERVER_UUID`
121+
--replace_result $master_uuid master_uuid
122+
--eval SET GLOBAL GTID_PURGED= "$master_uuid:1-2"
123+
124+
--connection slave
125+
--source include/start_slave.inc
126+
127+
# Cleanup
59128
--source include/rpl_end.inc

mysql-test/suite/rpl/t/rpl_kill_query.test

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ CALL mtr.add_suppression("The slave coordinator and worker threads are stopped,
6363
--echo ---- Clean up ----
6464
--source include/stop_slave_io.inc
6565
RESET SLAVE;
66+
RESET MASTER;
6667
DROP DATABASE db1;
6768
--source include/rpl_connection_master.inc
6869
RESET MASTER;

mysql-test/suite/rpl/t/rpl_packet.test

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ connection slave;
151151
# Remove the bad binlog and clear error status on slave.
152152
STOP SLAVE;
153153
RESET SLAVE;
154+
RESET MASTER;
154155
--connection master
155156
RESET MASTER;
156157

sql/rpl_gtid.h

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* Copyright (c) 2011, 2013, Oracle and/or its affiliates. All rights reserved.
1+
/* Copyright (c) 2011, 2014, Oracle and/or its affiliates. All rights reserved.
22
33
This program is free software; you can redistribute it and/or
44
modify it under the terms of the GNU General Public License as
@@ -1032,6 +1032,24 @@ class Gtid_set
10321032
enum_return_status ensure_sidno(rpl_sidno sidno);
10331033
/// Returns true if this Gtid_set is a subset of the other Gtid_set.
10341034
bool is_subset(const Gtid_set *super) const;
1035+
1036+
/**
1037+
Returns true if the intervals that this Gtid_set holds for subset_sidno
1038+
are a subset of the intervals that 'super' holds for superset_sidno.
1039+
1040+
@param super Gtid_set with which 'this'::gtid_set needs to be
1041+
compared
1042+
@param superset_sidno The sidno that will be compared, relative to
1043+
super->sid_map. If it is 0 while subset_sidno is not 0, false is returned.
1044+
@param subset_sidno The sidno that will be compared, relative to
1045+
this->sid_map. If it is 0, true is returned without looking at 'super'.
1046+
@return true If 'this' Gtid_set is subset of given
1047+
'super' Gtid_set.
1048+
false If 'this' Gtid_set is *not* subset of given
1049+
'super' Gtid_set.
1050+
*/
1051+
bool is_subset_for_sid(const Gtid_set *super, rpl_sidno superset_sidno,
1052+
rpl_sidno subset_sidno) const;
10351053
/// Returns true if there is at least one element of this Gtid_set in
10361054
/// the other Gtid_set.
10371055
bool is_intersection_nonempty(const Gtid_set *other) const;
@@ -2266,6 +2284,11 @@ class Gtid_state
22662284
const Owned_gtids *get_owned_gtids() const { return &owned_gtids; }
22672285
/// Return the server's SID's SIDNO
22682286
rpl_sidno get_server_sidno() const { return server_sidno; }
2287+
/// Return the server's SID, i.e. the SID that global_sid_map holds for
/// server_sidno.
2288+
const rpl_sid &get_server_sid() const
2289+
{
2290+
return global_sid_map->sidno_to_sid(server_sidno);
2291+
}
22692292
#ifndef DBUG_OFF
22702293
/**
22712294
Debug only: Returns an upper bound on the length of the string

sql/rpl_gtid_set.cc

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* Copyright (c) 2011, 2013, Oracle and/or its affiliates. All rights reserved.
1+
/* Copyright (c) 2011, 2014, Oracle and/or its affiliates. All rights reserved.
22
33
This program is free software; you can redistribute it and/or
44
modify it under the terms of the GNU General Public License as
@@ -1053,6 +1053,49 @@ bool Gtid_set::is_interval_subset(Const_interval_iterator *sub,
10531053
DBUG_RETURN(true);
10541054
}
10551055

1056+
bool Gtid_set::is_subset_for_sid(const Gtid_set *super,
1057+
rpl_sidno superset_sidno,
1058+
rpl_sidno subset_sidno) const
1059+
{
1060+
DBUG_ENTER("Gtid_set::is_subset_for_sidno");
1061+
/*
1062+
The following assert code is to see that caller acquired
1063+
either write or read lock on global_sid_lock.
1064+
Note that if it is read lock, then it should also
1065+
acquire lock on sidno.
1066+
i.e., the caller must acquire lock either A1 way or A2 way.
1067+
A1. global_sid_lock.wrlock()
1068+
A2. global_sid_lock.rdlock(); gtid_state.lock_sidno(sidno)
1069+
*/
1070+
if (sid_lock != NULL)
1071+
super->sid_lock->assert_some_lock();
1072+
if (super->sid_lock != NULL)
1073+
super->sid_lock->assert_some_lock();
1074+
/*
1075+
If subset(i.e, this object) does not have required sid in it, i.e.,
1076+
subset_sidno is zero, then it means it is subset of any given
1077+
super set. Hence return true.
1078+
*/
1079+
if (subset_sidno == 0)
1080+
DBUG_RETURN(true);
1081+
/*
1082+
If superset (i.e., the passed gtid_set) does not have given sid in it,
1083+
i.e., superset_sidno is zero, then it means it cannot be superset
1084+
to any given subset. Hence return false.
1085+
*/
1086+
if (superset_sidno == 0)
1087+
DBUG_RETURN(false);
1088+
/*
1089+
Once we have valid(non-zero) subset's and superset's sid numbers, call
1090+
is_interval_subset().
1091+
*/
1092+
Const_interval_iterator subset_ivit(this, subset_sidno);
1093+
Const_interval_iterator superset_ivit(super, superset_sidno);
1094+
if (!is_interval_subset(&subset_ivit, &superset_ivit))
1095+
DBUG_RETURN(false);
1096+
1097+
DBUG_RETURN(true);
1098+
}
10561099

10571100
bool Gtid_set::is_subset(const Gtid_set *super) const
10581101
{

0 commit comments

Comments
 (0)