Skip to content

Commit e7a4cbe

Browse files
author
Venkatesh Duggirala
committed
Bug#17280176 TRANSACTIONS SKIPPED ON SLAVE AFTER
"STOP/START SLAVE" USING GTID REPLICATION Analysis: The slave updates the 'GTID_RETRIEVED' set upon receiving the GTID_LOG_EVENT for a particular transaction, which is the first event in the event group. Say the I/O thread is stopped *after* adding the GTID number to the 'GTID_RETRIEVED' set and *before* it actually retrieves all the events from that GTID event group. The next time this I/O thread reconnects, it sends the union of the GTID_RETRIEVED + GTID_EXECUTED sets to the master. So the master thinks that the slave has all the events from this GTID set (which includes the partially retrieved GTID) and it will not resend them. Hence the slave is missing some events forever. Fix: It is not easy to find the end of a group of events. So the MySQL server is unsure whether the I/O thread retrieved the last gtid transaction events completely or not (before it went down because of a crash/normal shutdown/normal stop slave io_thread). It is possible that the I/O thread would have retrieved and written only partial transaction events. So the server will request the master to send the last gtid event once again. We do this by removing the last I/O thread retrieved gtid event from "Retrieved_gtid_set". Possible cases: 1) The I/O thread would have retrieved the full transaction already the first time itself, but retrieving it again will not cause a problem because the GTID number is the same; hence the SQL thread will not commit it again. 2) The I/O thread would have retrieved the full transaction already and the SQL thread would have already executed it. In that case, we are not going to remove the last retrieved gtid from "Retrieved_gtid_set", otherwise we will see gaps in "Retrieved set".
1 parent 765b265 commit e7a4cbe

8 files changed

Lines changed: 117 additions & 17 deletions

File tree

mysql-test/include/sync_slave_io.inc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737

3838
if ($use_gtids)
3939
{
40-
--let $slave_param= Retrieved_Gtid_set
40+
--let $slave_param= Retrieved_Gtid_Set
4141
--let $slave_param_value= $_saved_gtids
4242
--source include/wait_for_slave_param.inc
4343
}

sql/binlog.cc

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2538,6 +2538,8 @@ bool MYSQL_BIN_LOG::open_index_file(const char *index_file_name_arg,
25382538
this object.
25392539
@param prev_gtids If not NULL, then the GTIDs from the
25402540
Previous_gtids_log_events are stored in this object.
2541+
@param last_gtid If not NULL, then the last GTID information from the
2542+
file will be stored in this object.
25412543
@param verify_checksum Set to true to verify event checksums.
25422544
25432545
@retval GOT_GTIDS The file was successfully read and it contains
@@ -2556,7 +2558,8 @@ enum enum_read_gtids_from_binlog_status
25562558
{ GOT_GTIDS, GOT_PREVIOUS_GTIDS, NO_GTIDS, ERROR, TRUNCATED };
25572559
static enum_read_gtids_from_binlog_status
25582560
read_gtids_from_binlog(const char *filename, Gtid_set *all_gtids,
2559-
Gtid_set *prev_gtids, bool verify_checksum)
2561+
Gtid_set *prev_gtids, Gtid *last_gtid,
2562+
bool verify_checksum)
25602563
{
25612564
DBUG_ENTER("read_gtids_from_binlog");
25622565
DBUG_PRINT("info", ("Opening file %s", filename));
@@ -2664,24 +2667,34 @@ read_gtids_from_binlog(const char *filename, Gtid_set *all_gtids,
26642667
at least one Gtid_log_event, so that we can distinguish the
26652668
return values GOT_GTIDS and GOT_PREVIOUS_GTIDS. We don't need
26662669
to read anything else from the binary log.
2670+
But if last_gtid is requested (i.e., NOT NULL), we should continue to
2671+
read all gtids. Otherwise, we are done.
26672672
*/
2668-
if (all_gtids == NULL)
2673+
if (all_gtids == NULL && last_gtid == NULL)
2674+
{
26692675
ret= GOT_GTIDS, done= true;
2676+
}
26702677
else
26712678
{
26722679
Gtid_log_event *gtid_ev= (Gtid_log_event *)ev;
26732680
rpl_sidno sidno= gtid_ev->get_sidno(false/*false=don't need lock*/);
26742681
if (sidno < 0)
26752682
ret= ERROR, done= true;
2683+
else
26762684
{
2677-
if (all_gtids->ensure_sidno(sidno) != RETURN_STATUS_OK)
2678-
ret= ERROR, done= true;
2679-
else if (all_gtids->_add_gtid(sidno, gtid_ev->get_gno()) !=
2680-
RETURN_STATUS_OK)
2681-
ret= ERROR, done= true;
2685+
if (all_gtids)
2686+
{
2687+
if (all_gtids->ensure_sidno(sidno) != RETURN_STATUS_OK)
2688+
ret= ERROR, done= true;
2689+
else if (all_gtids->_add_gtid(sidno, gtid_ev->get_gno()) !=
2690+
RETURN_STATUS_OK)
2691+
ret= ERROR, done= true;
2692+
DBUG_PRINT("info", ("Got Gtid from file '%s': Gtid(%d, %lld).",
2693+
filename, sidno, gtid_ev->get_gno()));
2694+
}
2695+
if (last_gtid)
2696+
last_gtid->set(sidno, gtid_ev->get_gno());
26822697
}
2683-
DBUG_PRINT("info", ("Got Gtid from file '%s': Gtid(%d, %lld).",
2684-
filename, sidno, gtid_ev->get_gno()));
26852698
}
26862699
break;
26872700
}
@@ -2778,7 +2791,7 @@ bool MYSQL_BIN_LOG::find_first_log_not_in_gtid_set(char *binlog_file_name,
27782791
const char *filename= rit->c_str();
27792792
DBUG_PRINT("info", ("Read Previous_gtids_log_event from filename='%s'",
27802793
filename));
2781-
switch (read_gtids_from_binlog(filename, NULL, &previous_gtid_set,
2794+
switch (read_gtids_from_binlog(filename, NULL, &previous_gtid_set, NULL,
27822795
opt_master_verify_checksum))
27832796
{
27842797
case ERROR:
@@ -2829,6 +2842,7 @@ bool MYSQL_BIN_LOG::find_first_log_not_in_gtid_set(char *binlog_file_name,
28292842
}
28302843

28312844
bool MYSQL_BIN_LOG::init_gtid_sets(Gtid_set *all_gtids, Gtid_set *lost_gtids,
2845+
Gtid *last_gtid,
28322846
bool verify_checksum, bool need_lock)
28332847
{
28342848
DBUG_ENTER("MYSQL_BIN_LOG::init_gtid_sets");
@@ -2888,15 +2902,17 @@ bool MYSQL_BIN_LOG::init_gtid_sets(Gtid_set *all_gtids, Gtid_set *lost_gtids,
28882902
reached_first_file= (rit == filename_list.rend());
28892903
DBUG_PRINT("info", ("filename='%s' reached_first_file=%d",
28902904
rit->c_str(), reached_first_file));
2891-
while (!got_gtids && !reached_first_file)
2905+
while ((!got_gtids || (last_gtid && last_gtid->empty()))
2906+
&& !reached_first_file)
28922907
{
28932908
const char *filename= rit->c_str();
28942909
rit++;
28952910
reached_first_file= (rit == filename_list.rend());
28962911
DBUG_PRINT("info", ("filename='%s' got_gtids=%d reached_first_file=%d",
28972912
filename, got_gtids, reached_first_file));
2898-
switch (read_gtids_from_binlog(filename, all_gtids,
2913+
switch (read_gtids_from_binlog(filename, got_gtids ? NULL : all_gtids,
28992914
reached_first_file ? lost_gtids : NULL,
2915+
last_gtid,
29002916
verify_checksum))
29012917
{
29022918
case ERROR:
@@ -2919,7 +2935,7 @@ bool MYSQL_BIN_LOG::init_gtid_sets(Gtid_set *all_gtids, Gtid_set *lost_gtids,
29192935
{
29202936
const char *filename= it->c_str();
29212937
DBUG_PRINT("info", ("filename='%s'", filename));
2922-
switch (read_gtids_from_binlog(filename, NULL, lost_gtids,
2938+
switch (read_gtids_from_binlog(filename, NULL, lost_gtids, NULL,
29232939
verify_checksum))
29242940
{
29252941
case ERROR:
@@ -4130,6 +4146,7 @@ int MYSQL_BIN_LOG::purge_logs(const char *to_log,
41304146
global_sid_lock->wrlock();
41314147
error= init_gtid_sets(NULL,
41324148
const_cast<Gtid_set *>(gtid_state->get_lost_gtids()),
4149+
NULL,
41334150
opt_master_verify_checksum,
41344151
false/*false=don't need lock*/);
41354152
global_sid_lock->unlock();

sql/binlog.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -482,14 +482,17 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
482482
@param lost_groups Will be filled with all GTIDs in the
483483
Previous_gtids_log_event of the first binary log that has a
484484
Previous_gtids_log_event.
485+
@param last_gtid Will be filled with the last available GTID information
486+
in the binary/relay log files.
485487
@param verify_checksum If true, checksums will be checked.
486488
@param need_lock If true, LOCK_log, LOCK_index, and
487489
global_sid_lock->wrlock are acquired; otherwise they are asserted
488490
to be taken already.
489491
@return false on success, true on error.
490492
*/
491493
bool init_gtid_sets(Gtid_set *gtid_set, Gtid_set *lost_groups,
492-
bool verify_checksum, bool need_lock);
494+
Gtid *last_gtid, bool verify_checksum,
495+
bool need_lock);
493496

494497
void set_previous_gtid_set(Gtid_set *previous_gtid_set_param)
495498
{

sql/mysqld.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5461,6 +5461,7 @@ int mysqld_main(int argc, char **argv)
54615461
if (mysql_bin_log.init_gtid_sets(
54625462
const_cast<Gtid_set *>(gtid_state->get_logged_gtids()),
54635463
const_cast<Gtid_set *>(gtid_state->get_lost_gtids()),
5464+
NULL,
54645465
opt_master_verify_checksum,
54655466
true/*true=need lock*/))
54665467
unireg_abort(1);

sql/rpl_gtid.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,10 @@ struct Gtid
760760

761761
/// Set both components to 0.
762762
void clear() { sidno= 0; gno= 0; }
763+
/// Set both components to the given input values.
764+
void set(rpl_sidno sno, rpl_gno gtidno)
{
  sidno= sno;
  gno= gtidno;
}
765+
/// Return true if both components are zero, false otherwise.
766+
bool empty() const
{
  return sidno == 0 && gno == 0;
}
763767
/**
764768
The maximal length of the textual representation of a SID, not
765769
including the terminating '\0'.
@@ -929,6 +933,16 @@ class Gtid_set
929933
*/
930934
enum_return_status _add_gtid(const Gtid &gtid)
931935
{ return _add_gtid(gtid.sidno, gtid.gno); }
936+
/**
937+
Removes the given GTID from this Gtid_set.
938+
939+
@param gtid Gtid to remove.
940+
@return RETURN_STATUS_OK or RETURN_STATUS_REPORTED_ERROR.
941+
*/
942+
enum_return_status _remove_gtid(const Gtid &gtid)
943+
{
944+
return _remove_gtid(gtid.sidno, gtid.gno);
945+
}
932946
/**
933947
Adds all groups from the given Gtid_set to this Gtid_set.
934948

sql/rpl_rli.cc

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ Relay_log_info::Relay_log_info(bool is_slave_recovery
136136

137137
relay_log.init_pthread_objects();
138138
do_server_version_split(::server_version, slave_version_split);
139-
139+
last_retrieved_gtid.clear();
140140
DBUG_VOID_RETURN;
141141
}
142142

@@ -176,6 +176,7 @@ Relay_log_info::~Relay_log_info()
176176
my_atomic_rwlock_destroy(&slave_open_temp_tables_lock);
177177
relay_log.cleanup();
178178
set_rli_description_event(NULL);
179+
last_retrieved_gtid.clear();
179180

180181
DBUG_VOID_RETURN;
181182
}
@@ -1760,8 +1761,16 @@ a file name for --relay-log-index option.", opt_relaylog_index_name);
17601761
gtid_set.dbug_print("set of GTIDs in relay log before initialization");
17611762
global_sid_lock->unlock();
17621763
#endif
1764+
/*
1765+
Below init_gtid_sets() function will parse the available relay logs and
1766+
set I/O retrieved gtid event in gtid_state object. We don't need to find
1767+
last_retrieved_gtid_event if relay_log_recovery=1 (retrieved set will
1768+
be cleared off in that case).
1769+
*/
1770+
Gtid *last_retrieved_gtid= is_relay_log_recovery ? NULL : get_last_retrieved_gtid();
17631771
if (!current_thd &&
17641772
relay_log.init_gtid_sets(&gtid_set, NULL,
1773+
last_retrieved_gtid,
17651774
opt_slave_sql_verify_checksum,
17661775
true/*true=need lock*/))
17671776
{

sql/rpl_rli.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,12 @@ class Relay_log_info : public Rpl_info
240240

241241
private:
242242
Gtid_set gtid_set;
243+
/* Last gtid retrieved by IO thread */
244+
Gtid last_retrieved_gtid;
243245

244246
public:
247+
/**
  Accessor for the last GTID retrieved by the I/O thread.

  @return Address of the last_retrieved_gtid member; the pointer is
  non-const, so the caller may update the stored GTID through it.
*/
Gtid *get_last_retrieved_gtid()
{
  return &last_retrieved_gtid;
}
248+
/**
  Overwrites the stored last retrieved GTID.

  @param gtid Value copied into last_retrieved_gtid.
*/
void set_last_retrieved_gtid(Gtid gtid)
{
  last_retrieved_gtid= gtid;
}
245249
int add_logged_gtid(rpl_sidno sidno, rpl_gno gno)
246250
{
247251
int ret= 0;

sql/rpl_slave.cc

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3087,6 +3087,45 @@ static int request_dump(THD *thd, MYSQL* mysql, Master_info* mi,
30873087
Gtid_set gtid_executed(&sid_map);
30883088
global_sid_lock->wrlock();
30893089
gtid_state->dbug_print();
3090+
3091+
/*
3092+
We are unsure whether I/O thread retrieved the last gtid transaction
3093+
completely or not (before it is going down because of a crash/normal
3094+
shutdown/normal stop slave io_thread). It is possible that I/O thread
3095+
would have retrieved and written only partial transaction events. So we
3096+
request Master to send the last gtid event once again. We do this by
3097+
removing the last I/O thread retrieved gtid event from
3098+
"Retrieved_gtid_set". Possible cases: 1) I/O thread would have
3099+
retrieved full transaction already in the first time itself, but
3100+
retrieving them again will not cause problem because GTID number is
3101+
same, Hence SQL thread will not commit it again. 2) I/O thread would
3102+
have retrieved full transaction already and SQL thread would have
3103+
already executed it. In that case, we are not going to remove the last
3104+
retrieved gtid from "Retrieved_gtid_set" otherwise we will see gaps in
3105+
"Retrieved set". The same case is handled in the below code. Please
3106+
note there will be partial transactions written in relay log but they
3107+
will not cause any problem in case of transactional tables. But in case
3108+
of non-transactional tables, partial trx will create inconsistency
3109+
between master and slave. In that case, users need to check manually.
3110+
*/
3111+
3112+
Gtid_set * retrieved_set= (const_cast<Gtid_set *>(mi->rli->get_gtid_set()));
3113+
Gtid *last_retrieved_gtid= mi->rli->get_last_retrieved_gtid();
3114+
3115+
/*
3116+
Remove last_retrieved_gtid only if it is not part of
3117+
executed_gtid_set
3118+
*/
3119+
if (!last_retrieved_gtid->empty() &&
3120+
!gtid_state->get_logged_gtids()->contains_gtid(*last_retrieved_gtid))
3121+
{
3122+
if (retrieved_set->_remove_gtid(*last_retrieved_gtid) != RETURN_STATUS_OK)
3123+
{
3124+
global_sid_lock->unlock();
3125+
goto err;
3126+
}
3127+
}
3128+
30903129
if (gtid_executed.add_gtid_set(mi->rli->get_gtid_set()) != RETURN_STATUS_OK ||
30913130
gtid_executed.add_gtid_set(gtid_state->get_logged_gtids()) !=
30923131
RETURN_STATUS_OK)
@@ -4359,7 +4398,6 @@ Stopping slave I/O thread due to out-of-memory error from master");
43594398
"could not queue event from master");
43604399
goto err;
43614400
}
4362-
43634401
if (RUN_HOOK(binlog_relay_io, after_queue_event,
43644402
(thd, mi, event_buf, event_len, synced)))
43654403
{
@@ -4412,6 +4450,18 @@ ignore_log_space_limit=%d",
44124450
log space");
44134451
goto err;
44144452
}
4453+
DBUG_EXECUTE_IF("stop_io_after_reading_gtid_log_event",
4454+
if (event_buf[EVENT_TYPE_OFFSET] == GTID_LOG_EVENT)
4455+
thd->killed= THD::KILLED_NO_VALUE;
4456+
);
4457+
DBUG_EXECUTE_IF("stop_io_after_reading_query_log_event",
4458+
if (event_buf[EVENT_TYPE_OFFSET] == QUERY_EVENT)
4459+
thd->killed= THD::KILLED_NO_VALUE;
4460+
);
4461+
DBUG_EXECUTE_IF("stop_io_after_reading_xid_log_event",
4462+
if (event_buf[EVENT_TYPE_OFFSET] == XID_EVENT)
4463+
thd->killed= THD::KILLED_NO_VALUE;
4464+
);
44154465
}
44164466
}
44174467

@@ -6660,6 +6710,8 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
66606710
{
66616711
global_sid_lock->rdlock();
66626712
int ret= rli->add_logged_gtid(gtid.sidno, gtid.gno);
6713+
if (!ret)
6714+
rli->set_last_retrieved_gtid(gtid);
66636715
global_sid_lock->unlock();
66646716
if (ret != 0)
66656717
goto err;

0 commit comments

Comments
 (0)