diff options
Diffstat (limited to 'sql/ha_innodb.cc')
-rw-r--r-- | sql/ha_innodb.cc | 287 |
1 files changed, 281 insertions, 6 deletions
diff --git a/sql/ha_innodb.cc b/sql/ha_innodb.cc index 3f592e36219..e68a85bdac9 100644 --- a/sql/ha_innodb.cc +++ b/sql/ha_innodb.cc @@ -45,10 +45,58 @@ have disables the InnoDB inlining in this file. */ #include "ha_innodb.h" -pthread_mutex_t innobase_share_mutex, // to protect innobase_open_files - prepare_commit_mutex; // to force correct commit order in binlog +pthread_mutex_t innobase_share_mutex, /* to protect innobase_open_files */ + prepare_commit_mutex; /* to force correct commit order in + binlog */ bool innodb_inited= 0; +/*-----------------------------------------------------------------*/ +/* These variables are used to implement (semi-)synchronous MySQL binlog +replication for InnoDB tables. */ + +pthread_cond_t innobase_repl_cond; /* Posix cond variable; + this variable is signaled + when enough binlog has been + sent to slave, so that a + waiting trx can return the + 'ok' message to the client + for a commit */ +pthread_mutex_t innobase_repl_cond_mutex; /* Posix cond variable mutex + that also protects the next + innobase_repl_... variables */ +uint innobase_repl_state; /* 1 if synchronous replication + is switched on and is working + ok; else 0 */ +uint innobase_repl_file_name_inited = 0; /* This is set to 1 when + innobase_repl_file_name + contains meaningful data */ +char* innobase_repl_file_name; /* The binlog name up to which + we have sent some binlog to + the slave */ +my_off_t innobase_repl_pos; /* The position in that file + up to which we have sent the + binlog to the slave */ +uint innobase_repl_n_wait_threads = 0; /* This tells how many + transactions currently are + waiting for the binlog to be + sent to the client */ +uint innobase_repl_wait_file_name_inited = 0; /* This is set to 1 + when we know the 'smallest' + wait position */ +char* innobase_repl_wait_file_name; /* NULL, or the 'smallest' + innobase_repl_file_name that + a transaction is waiting for */ +my_off_t innobase_repl_wait_pos; /* The smallest position in + that file that a trx is + waiting for: the trx can + proceed and send an 'ok' to + the client when MySQL has sent + the binlog up to this position + to the slave */ +/*-----------------------------------------------------------------*/ + + + /* Store MySQL definition of 'byte': in Linux it is char while InnoDB uses unsigned char; the header univ.i which we include next defines 'byte' as a macro which expands to 'unsigned char' */ @@ -97,7 +145,7 @@ long innobase_mirrored_log_groups, innobase_log_files_in_group, innobase_log_file_size, innobase_log_buffer_size, innobase_buffer_pool_awe_mem_mb, innobase_buffer_pool_size, innobase_additional_mem_pool_size, - innobase_file_io_threads, innobase_lock_wait_timeout, + innobase_file_io_threads, innobase_lock_wait_timeout, innobase_thread_concurrency, innobase_force_recovery, innobase_open_files; @@ -1531,10 +1579,10 @@ innobase_commit( DBUG_RETURN(0); } -/* The following defined-out code will be enabled later when we put the +/* TODO: put the MySQL-4.1 functionality back to 5.0. This is needed to get InnoDB Hot Backup to work. */ -#if 0 + /********************************************************************* This is called when MySQL writes the binlog entry for the current transaction. Writes to the InnoDB tablespace info which tells where the @@ -1563,6 +1611,24 @@ innobase_report_binlog_offset_and_commit( trx->mysql_log_file_name = log_file_name; trx->mysql_log_offset = (ib_longlong)end_offset; + if (thd->variables.sync_replication) { + /* Let us store the binlog file name and the position, so that + we know how long to wait for the binlog to the replicated to + the slave in synchronous replication. */ + + if (trx->repl_wait_binlog_name == NULL) { + + trx->repl_wait_binlog_name = + (char*)mem_alloc(FN_REFLEN + 100); + } + + ut_a(strlen(log_file_name) <= FN_REFLEN + 100); + + strcpy(trx->repl_wait_binlog_name, log_file_name); + + trx->repl_wait_binlog_pos = (ib_longlong)end_offset; + } + trx->flush_log_later = TRUE; innobase_commit(thd, trx_handle); @@ -1572,6 +1638,7 @@ innobase_report_binlog_offset_and_commit( return(0); } +#if 0 /*********************************************************************** This function stores the binlog offset and flushes logs. */ @@ -1602,7 +1669,6 @@ innobase_store_binlog_offset_and_flush_log( /* Syncronous flush of the log buffer to disk */ log_buffer_flush_to_disk(); } - #endif /********************************************************************* @@ -1615,7 +1681,10 @@ innobase_commit_complete( /* out: 0 */ THD* thd) /* in: user thread */ { + struct timespec abstime; trx_t* trx; + int cmp; + int ret; trx = (trx_t*) thd->ha_data[innobase_hton.slot]; @@ -1631,10 +1700,216 @@ innobase_commit_complete( trx_commit_complete_for_mysql(trx); } + printf("Wait binlog name %s, repl state %lu\n", + trx->repl_wait_binlog_name, + (uint)innobase_repl_state); + + if (thd->variables.sync_replication + && trx->repl_wait_binlog_name + && innobase_repl_state != 0) { + + /* In synchronous replication, let us wait until the MySQL + replication has sent the relevant binlog segment to the + replication slave. */ + +/* TODO: Make sure MySQL uses some way (TCP_NODELAY?) to ensure that the data +has been received in the slave! */ + + pthread_mutex_lock(&innobase_repl_cond_mutex); +try_again: + if (innobase_repl_state == 0) { + + pthread_mutex_unlock(&innobase_repl_cond_mutex); + + return(0); + } + + cmp = strcmp(innobase_repl_file_name, + trx->repl_wait_binlog_name); + if (cmp > 0 + || (cmp == 0 && innobase_repl_pos + >= (my_off_t)trx->repl_wait_binlog_pos)) { + /* We have already sent the relevant binlog to the + slave: no need to wait here */ + + pthread_mutex_unlock(&innobase_repl_cond_mutex); + +/* printf("Binlog now sent\n"); */ + + return(0); + } + + /* Let us update the info about the minimum binlog position + of waiting threads in the innobase_repl_... variables */ + + if (innobase_repl_wait_file_name_inited != 0) { + cmp = strcmp(trx->repl_wait_binlog_name, + innobase_repl_wait_file_name); + if (cmp < 0 + || (cmp == 0 && (my_off_t)trx->repl_wait_binlog_pos + <= innobase_repl_wait_pos)) { + /* This thd has an even lower position, let + us update the minimum info */ + + strcpy(innobase_repl_wait_file_name, + trx->repl_wait_binlog_name); + + innobase_repl_wait_pos = + trx->repl_wait_binlog_pos; + } + } else { + strcpy(innobase_repl_wait_file_name, + trx->repl_wait_binlog_name); + + innobase_repl_wait_pos = trx->repl_wait_binlog_pos; + + innobase_repl_wait_file_name_inited = 1; + } + set_timespec(abstime, thd->variables.sync_replication_timeout); + + /* Let us suspend this thread to wait on the condition; + when replication has progressed far enough, we will release + these waiting threads. The following call + pthread_cond_timedwait also atomically unlocks + innobase_repl_cond_mutex. */ + + innobase_repl_n_wait_threads++; + +/* printf("Waiting for binlog to be sent\n"); */ + + ret = pthread_cond_timedwait(&innobase_repl_cond, + &innobase_repl_cond_mutex, &abstime); + innobase_repl_n_wait_threads--; + + if (ret != 0) { + ut_print_timestamp(stderr); + + fprintf(stderr, +" InnoDB: Error: MySQL synchronous replication\n" +"InnoDB: was not able to send the binlog to the slave within the\n" +"InnoDB: timeout %lu. We assume that the slave has become inaccessible,\n" +"InnoDB: and switch off synchronous replication until the communication.\n" +"InnoDB: to the slave works again.\n", + thd->variables.sync_replication_timeout); + fprintf(stderr, +"InnoDB: MySQL synchronous replication has sent binlog\n" +"InnoDB: to the slave up to file %s, position %lu\n", innobase_repl_file_name, + (ulong)innobase_repl_pos); + fprintf(stderr, +"InnoDB: This transaction needs it to be sent up to\n" +"InnoDB: file %s, position %lu\n", trx->repl_wait_binlog_name, + (uint)trx->repl_wait_binlog_pos); + + innobase_repl_state = 0; + + pthread_mutex_unlock(&innobase_repl_cond_mutex); + + return(0); + } + + goto try_again; + } + return(0); } /********************************************************************* +In synchronous replication, reports to InnoDB up to which binlog position +we have sent the binlog to the slave. Note that replication is synchronous +for one slave only. For other slaves, we do nothing in this function. This +function is used in a replication master. */ + +int +innobase_repl_report_sent_binlog( +/*=============================*/ + /* out: 0 */ + THD* thd, /* in: thread doing the binlog communication to + the slave */ + char* log_file_name, /* in: binlog file name */ + my_off_t end_offset) /* in: the offset in the binlog file up to + which we sent the contents to the slave */ +{ + int cmp; + ibool can_release_threads = 0; + + /* If synchronous replication is not switched on, or this thd is + sending binlog to a slave where we do not need synchronous replication, + then return immediately */ + + if (thd->server_id != thd->variables.sync_replication_slave_id) { + + /* Do nothing */ + + return(0); + } + + pthread_mutex_lock(&innobase_repl_cond_mutex); + + if (innobase_repl_state == 0) { + + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Switching MySQL synchronous replication on again at\n" +"InnoDB: binlog file %s, position %lu\n", log_file_name, (ulong)end_offset); + + innobase_repl_state = 1; + } + + /* The position should increase monotonically, since just one thread + is sending the binlog to the slave for which we want synchronous + replication. Let us check this, and print an error to the .err log + if that is not the case. */ + + if (innobase_repl_file_name_inited) { + cmp = strcmp(log_file_name, innobase_repl_file_name); + + if (cmp < 0 + || (cmp == 0 && end_offset < innobase_repl_pos)) { + + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: MySQL synchronous replication has sent binlog\n" +"InnoDB: to the slave up to file %s, position %lu\n", innobase_repl_file_name, + (ulong)innobase_repl_pos); + fprintf(stderr, +"InnoDB: but now MySQL reports that it sent the binlog only up to\n" +"InnoDB: file %s, position %lu\n", log_file_name, (ulong)end_offset); + + } + } + + strcpy(innobase_repl_file_name, log_file_name); + innobase_repl_pos = end_offset; + innobase_repl_file_name_inited = 1; + + if (innobase_repl_n_wait_threads > 0) { + /* Let us check if some of the waiting threads doing a trx + commit can now proceed */ + + cmp = strcmp(innobase_repl_file_name, + innobase_repl_wait_file_name); + if (cmp > 0 + || (cmp == 0 && innobase_repl_pos + >= innobase_repl_wait_pos)) { + + /* Yes, at least one waiting thread can now proceed: + let us release all waiting threads with a broadcast */ + + can_release_threads = 1; + + innobase_repl_wait_file_name_inited = 0; + } + } + + pthread_mutex_unlock(&innobase_repl_cond_mutex); + + if (can_release_threads) { + + pthread_cond_broadcast(&innobase_repl_cond); + } +} + +/********************************************************************* Rolls back a transaction or the latest SQL statement. */ static int |