Diffstat (limited to 'sql'): 47 files changed, 5849 insertions, 1138 deletions
diff --git a/sql/Makefile.am b/sql/Makefile.am index a4f761fdc16..a9a0449fbb6 100644 --- a/sql/Makefile.am +++ b/sql/Makefile.am @@ -53,7 +53,7 @@ noinst_HEADERS = item.h item_func.h item_sum.h item_cmpfunc.h \ sql_manager.h sql_map.h sql_string.h unireg.h \ sql_error.h field.h handler.h mysqld_suffix.h \ ha_heap.h ha_myisam.h ha_myisammrg.h ha_partition.h \ - opt_range.h protocol.h \ + opt_range.h protocol.h rpl_tblmap.h \ sql_select.h structs.h table.h sql_udf.h hash_filo.h\ lex.h lex_symbol.h sql_acl.h sql_crypt.h \ log_event.h sql_repl.h slave.h rpl_filter.h \ @@ -87,7 +87,7 @@ mysqld_SOURCES = sql_lex.cc sql_handler.cc sql_partition.cc \ sql_db.cc sql_table.cc sql_rename.cc sql_crypt.cc \ sql_load.cc mf_iocache.cc field_conv.cc sql_show.cc \ sql_udf.cc sql_analyse.cc sql_analyse.h sql_cache.cc \ - slave.cc sql_repl.cc rpl_filter.cc \ + slave.cc sql_repl.cc rpl_filter.cc rpl_tblmap.cc \ sql_union.cc sql_derived.cc \ client.c sql_client.cc mini_client_errors.c pack.c\ stacktrace.c repl_failsafe.h repl_failsafe.cc \ @@ -96,7 +96,7 @@ mysqld_SOURCES = sql_lex.cc sql_handler.cc sql_partition.cc \ tztime.cc my_time.c my_decimal.cc\ sp_head.cc sp_pcontext.cc sp_rcontext.cc sp.cc \ sp_cache.cc parse_file.cc sql_trigger.cc \ - sql_plugin.cc\ + sql_plugin.cc sql_binlog.cc \ handlerton.cc EXTRA_mysqld_SOURCES = ha_innodb.cc ha_berkeley.cc ha_archive.cc \ ha_innodb.h ha_berkeley.h ha_archive.h \ diff --git a/sql/ha_innodb.cc b/sql/ha_innodb.cc index 8b0cbe87562..d978327f2ce 100644 --- a/sql/ha_innodb.cc +++ b/sql/ha_innodb.cc @@ -827,6 +827,7 @@ ha_innobase::ha_innobase(TABLE_SHARE *table_arg) HA_CAN_INDEX_BLOBS | HA_CAN_SQL_HANDLER | HA_NOT_EXACT_COUNT | + HA_PRIMARY_KEY_ALLOW_RANDOM_ACCESS | HA_PRIMARY_KEY_IN_READ_INDEX | HA_CAN_GEOMETRY | HA_TABLE_SCAN_ON_INDEX), @@ -3052,6 +3053,9 @@ ha_innobase::store_key_val_for_row( continue; } + /* In a column prefix index, we may need to truncate + the stored value: */ + cs = key_part->field->charset(); src_start = record + key_part->offset; @@ -3068,7 +3072,11 @@ ha_innobase::store_key_val_for_row( memcpy(buff, src_start, len); buff+=len; - /* Pad the unused space with spaces */ + /* Pad the unused space with spaces. Note that no + padding is ever needed for UCS-2 because in MySQL, + all UCS2 characters are 2 bytes, as MySQL does not + support surrogate pairs, which are needed to represent + characters in the range U+10000 to U+10FFFF. */ if (len < key_part->length) { len = key_part->length - len; @@ -3791,9 +3799,9 @@ ha_innobase::delete_row( } /************************************************************************** -Removes a new lock set on a row. This can be called after a row has been read -in the processing of an UPDATE or a DELETE query, if the option -innodb_locks_unsafe_for_binlog is set. */ +Removes a new lock set on a row, if it was not read optimistically. This can +be called after a row has been read in the processing of an UPDATE or a DELETE +query, if the option innodb_locks_unsafe_for_binlog is set. 
*/ void ha_innobase::unlock_row(void) @@ -3803,7 +3811,7 @@ ha_innobase::unlock_row(void) DBUG_ENTER("ha_innobase::unlock_row"); - if (last_query_id != user_thd->query_id) { + if (UNIV_UNLIKELY(last_query_id != user_thd->query_id)) { ut_print_timestamp(stderr); sql_print_error("last_query_id is %lu != user_thd_query_id is " "%lu", (ulong) last_query_id, @@ -3811,9 +3819,45 @@ ha_innobase::unlock_row(void) mem_analyze_corruption((byte *) prebuilt->trx); ut_error; } - - if (srv_locks_unsafe_for_binlog) { + + switch (prebuilt->row_read_type) { + case ROW_READ_WITH_LOCKS: + if (!srv_locks_unsafe_for_binlog) { + break; + } + /* fall through */ + case ROW_READ_TRY_SEMI_CONSISTENT: row_unlock_for_mysql(prebuilt, FALSE); + break; + case ROW_READ_DID_SEMI_CONSISTENT: + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + break; + } + + DBUG_VOID_RETURN; +} + +/* See handler.h and row0mysql.h for docs on this function. */ +bool +ha_innobase::was_semi_consistent_read(void) +/*=======================================*/ +{ + row_prebuilt_t* prebuilt = (row_prebuilt_t*) innobase_prebuilt; + + return(prebuilt->row_read_type == ROW_READ_DID_SEMI_CONSISTENT); +} + +/* See handler.h and row0mysql.h for docs on this function. */ +void +ha_innobase::try_semi_consistent_read(bool yes) +/*===========================================*/ +{ + row_prebuilt_t* prebuilt = (row_prebuilt_t*) innobase_prebuilt; + + if (yes && srv_locks_unsafe_for_binlog) { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } else { + prebuilt->row_read_type = ROW_READ_WITH_LOCKS; } } @@ -4328,6 +4372,13 @@ ha_innobase::rnd_init( err = change_active_index(primary_key); } + /* Don't use semi-consistent read in random row reads (by position). + This means we must disable semi_consistent_read if scan is false */ + + if (!scan) { + try_semi_consistent_read(0); + } + start_of_scan = 1; return(err); diff --git a/sql/ha_innodb.h b/sql/ha_innodb.h index f9a185bd885..fd0d3aa7e8c 100644 --- a/sql/ha_innodb.h +++ b/sql/ha_innodb.h @@ -122,6 +122,8 @@ class ha_innobase: public handler int write_row(byte * buf); int update_row(const byte * old_data, byte * new_data); int delete_row(const byte * buf); + bool was_semi_consistent_read(); + void try_semi_consistent_read(bool yes); void unlock_row(); int index_init(uint index, bool sorted); diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc index 8b4e0d9cfee..f20fb7304ba 100644 --- a/sql/ha_partition.cc +++ b/sql/ha_partition.cc @@ -647,7 +647,7 @@ bool ha_partition::create_handler_file(const char *name) if (!m_is_sub_partitioned) { name_buffer_ptr= strmov(name_buffer_ptr, part_elem->partition_name)+1; - *engine_array= (uchar) part_elem->engine_type; + *engine_array= (uchar) ha_legacy_type(part_elem->engine_type); DBUG_PRINT("info", ("engine: %u", *engine_array)); engine_array++; } @@ -660,7 +660,7 @@ bool ha_partition::create_handler_file(const char *name) name_buffer_ptr+= name_add(name_buffer_ptr, part_elem->partition_name, subpart_elem->partition_name); - *engine_array= (uchar) part_elem->engine_type; + *engine_array= (uchar) ha_legacy_type(part_elem->engine_type); engine_array++; } } diff --git a/sql/handler.cc b/sql/handler.cc index bcccdf2e2b0..59445a1b2f1 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -22,6 +22,7 @@ #endif #include "mysql_priv.h" +#include "rpl_filter.h" #include "ha_heap.h" #include "ha_myisam.h" #include "ha_myisammrg.h" @@ -29,7 +30,7 @@ #include <myisampack.h> #include <errno.h> - + #ifdef WITH_NDBCLUSTER_STORAGE_ENGINE #define NDB_MAX_ATTRIBUTES_IN_TABLE 
128 #include "ha_ndbcluster.h" @@ -37,12 +38,15 @@ #ifdef WITH_PARTITION_STORAGE_ENGINE #include "ha_partition.h" #endif + #ifdef WITH_INNOBASE_STORAGE_ENGINE #include "ha_innodb.h" #endif extern handlerton *sys_table_types[]; +#define BITMAP_STACKBUF_SIZE (128/8) + /* static functions defined in this file */ static handler *create_default(TABLE_SHARE *table); @@ -1937,6 +1941,9 @@ void handler::print_error(int error, myf errflag) my_error(ER_NO_SUCH_TABLE, MYF(0), table_share->db.str, table_share->table_name.str); break; + case HA_ERR_RBR_LOGGING_FAILED: + textno= ER_BINLOG_ROW_LOGGING_FAILED; + break; default: { /* The error was "unknown" to this function. @@ -2721,6 +2728,7 @@ TYPELIB *ha_known_exts(void) return &known_extensions; } + static bool stat_print(THD *thd, const char *type, uint type_len, const char *file, uint file_len, const char *status, uint status_len) @@ -2781,6 +2789,106 @@ bool ha_show_status(THD *thd, handlerton *db_type, enum ha_stat_type stat) return result; } +/* + Function to check if the conditions for row-based binlogging is + correct for the table. + + A row in the given table should be replicated if: + - Row-based replication is on + - It is not a temporary table + - The binlog is enabled + - The table shall be binlogged (binlog_*_db rules) [Seems disabled /Matz] +*/ + +#ifdef HAVE_ROW_BASED_REPLICATION +static bool check_table_binlog_row_based(THD *thd, TABLE *table) +{ + return + binlog_row_based && + thd && (thd->options & OPTION_BIN_LOG) && + (table->s->tmp_table == NO_TMP_TABLE); +} + +template<class RowsEventT> int binlog_log_row(TABLE* table, + const byte *before_record, + const byte *after_record) +{ + bool error= 0; + THD *const thd= current_thd; + + if (check_table_binlog_row_based(thd, table)) + { + MY_BITMAP cols; + /* Potential buffer on the stack for the bitmap */ + uint32 bitbuf[BITMAP_STACKBUF_SIZE/sizeof(uint32)]; + uint n_fields= table->s->fields; + my_bool use_bitbuf= n_fields <= sizeof(bitbuf)*8; + if (likely(!(error= bitmap_init(&cols, + use_bitbuf ? bitbuf : NULL, + (n_fields + 7) & ~7UL, + false)))) + { + bitmap_set_all(&cols); + error= + RowsEventT::binlog_row_logging_function(thd, table, + table->file->has_transactions(), + &cols, table->s->fields, + before_record, after_record); + if (!use_bitbuf) + bitmap_free(&cols); + } + } + return error ? HA_ERR_RBR_LOGGING_FAILED : 0; +} + + +/* + Instantiate the versions we need for the above template function, because we + have -fno-implicit-template as compiling option. 
+*/ + +template int binlog_log_row<Write_rows_log_event>(TABLE *, const byte *, const byte *); +template int binlog_log_row<Delete_rows_log_event>(TABLE *, const byte *, const byte *); +template int binlog_log_row<Update_rows_log_event>(TABLE *, const byte *, const byte *); + +#endif /* HAVE_ROW_BASED_REPLICATION */ + +int handler::ha_write_row(byte *buf) +{ + int error; + if (likely(!(error= write_row(buf)))) + { +#ifdef HAVE_ROW_BASED_REPLICATION + error= binlog_log_row<Write_rows_log_event>(table, 0, buf); +#endif + } + return error; +} + +int handler::ha_update_row(const byte *old_data, byte *new_data) +{ + int error; + if (likely(!(error= update_row(old_data, new_data)))) + { +#ifdef HAVE_ROW_BASED_REPLICATION + error= binlog_log_row<Update_rows_log_event>(table, old_data, new_data); +#endif + } + return error; +} + +int handler::ha_delete_row(const byte *buf) +{ + int error; + if (likely(!(error= delete_row(buf)))) + { +#ifdef HAVE_ROW_BASED_REPLICATION + error= binlog_log_row<Delete_rows_log_event>(table, buf, 0); +#endif + } + return error; +} + #ifdef HAVE_REPLICATION /* diff --git a/sql/handler.h b/sql/handler.h index ff81a259a73..27b3ed3fab1 100644 --- a/sql/handler.h +++ b/sql/handler.h @@ -74,6 +74,13 @@ */ #define HA_CAN_INSERT_DELAYED (1 << 14) #define HA_PRIMARY_KEY_IN_READ_INDEX (1 << 15) +/* + If HA_PRIMARY_KEY_ALLOW_RANDOM_ACCESS is set, it means that the engine can + do this: the position of an arbitrary record can be retrieved using + position() when the table has a primary key, effectively allowing random + access on the table based on a given record. +*/ +#define HA_PRIMARY_KEY_ALLOW_RANDOM_ACCESS (1 << 16) #define HA_NOT_DELETE_WITH_CACHE (1 << 18) #define HA_NO_PREFIX_CHAR_KEYS (1 << 20) #define HA_CAN_FULLTEXT (1 << 21) @@ -1054,11 +1061,9 @@ public: uint get_index(void) const { return active_index; } virtual int open(const char *name, int mode, uint test_if_locked)=0; virtual int close(void)=0; - virtual int write_row(byte * buf) { return HA_ERR_WRONG_COMMAND; } - virtual int update_row(const byte * old_data, byte * new_data) - { return HA_ERR_WRONG_COMMAND; } - virtual int delete_row(const byte * buf) - { return HA_ERR_WRONG_COMMAND; } + virtual int ha_write_row(byte * buf); + virtual int ha_update_row(const byte * old_data, byte * new_data); + virtual int ha_delete_row(const byte * buf); /* SYNOPSIS start_bulk_update() @@ -1189,6 +1194,26 @@ public: virtual int extra_opt(enum ha_extra_function operation, ulong cache_size) { return extra(operation); } virtual int external_lock(THD *thd, int lock_type) { return 0; } + /* + In an UPDATE or DELETE, if the row under the cursor was locked by another + transaction, and the engine used an optimistic read of the last + committed row value under the cursor, then the engine returns 1 from this + function. MySQL must NOT try to update this optimistic value. If the + optimistic value does not match the WHERE condition, MySQL can decide to + skip over this row. Currently only works for InnoDB. This can be used to + avoid unnecessary lock waits. + + If this method returns nonzero, it will also signal the storage + engine that the next read will be a locking re-read of the row. + */ + virtual bool was_semi_consistent_read() { return 0; } + /* + Tell the engine whether it should avoid unnecessary lock waits. + If yes, in an UPDATE or DELETE, if the row under the cursor was locked + by another transaction, the engine may try an optimistic read of + the last committed row value under the cursor. 
+ */ + virtual void try_semi_consistent_read(bool) {} virtual void unlock_row() {} virtual int start_stmt(THD *thd, thr_lock_type lock_type) {return 0;} /* @@ -1405,6 +1430,31 @@ public: virtual bool check_if_incompatible_data(HA_CREATE_INFO *create_info, uint table_changes) { return COMPATIBLE_DATA_NO; } + +private: + + /* + Row-level primitives for storage engines. + These should be overridden by the storage engine class. To call + these methods, use the corresponding 'ha_*' method above. + */ + friend int ndb_add_binlog_index(THD *, void *); + + virtual int write_row(byte *buf __attribute__((unused))) + { + return HA_ERR_WRONG_COMMAND; + } + + virtual int update_row(const byte *old_data __attribute__((unused)), + byte *new_data __attribute__((unused))) + { + return HA_ERR_WRONG_COMMAND; + } + + virtual int delete_row(const byte *buf __attribute__((unused))) + { + return HA_ERR_WRONG_COMMAND; + } }; /* Some extern variables used with handlers */ diff --git a/sql/item_sum.cc b/sql/item_sum.cc index a8163a1758d..dc1cf6cc8b7 100644 --- a/sql/item_sum.cc +++ b/sql/item_sum.cc @@ -2662,7 +2662,7 @@ bool Item_sum_count_distinct::add() */ return tree->unique_add(table->record[0] + table->s->null_bytes); } - if ((error= table->file->write_row(table->record[0])) && + if ((error= table->file->ha_write_row(table->record[0])) && error != HA_ERR_FOUND_DUPP_KEY && error != HA_ERR_FOUND_DUPP_UNIQUE) return TRUE; diff --git a/sql/log.cc b/sql/log.cc index d30cf3266f9..44d3869e9d5 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -47,6 +47,19 @@ static int binlog_commit(THD *thd, bool all); static int binlog_rollback(THD *thd, bool all); static int binlog_prepare(THD *thd, bool all); +/* + This is a POD. Please keep it that way! + + Don't add constructors, destructors, or virtual functions. +*/ +struct binlog_trx_data { + bool empty() const { + return pending == NULL && my_b_tell(&trans_log) == 0; + } + IO_CACHE trans_log; // The transaction cache + Rows_log_event *pending; // The pending binrows event +}; + handlerton binlog_hton = { MYSQL_HANDLERTON_INTERFACE_VERSION, "binlog", @@ -92,19 +105,45 @@ bool binlog_init() static int binlog_close_connection(THD *thd) { - IO_CACHE *trans_log= (IO_CACHE*)thd->ha_data[binlog_hton.slot]; - DBUG_ASSERT(mysql_bin_log.is_open() && !my_b_tell(trans_log)); + binlog_trx_data *const trx_data= + (binlog_trx_data*) thd->ha_data[binlog_hton.slot]; + IO_CACHE *trans_log= &trx_data->trans_log; + DBUG_ASSERT(mysql_bin_log.is_open() && trx_data->empty()); close_cached_file(trans_log); - my_free((gptr)trans_log, MYF(0)); + thd->ha_data[binlog_hton.slot]= 0; + my_free((gptr)trx_data, MYF(0)); return 0; } -static int binlog_end_trans(THD *thd, IO_CACHE *trans_log, Log_event *end_ev) +static int +binlog_end_trans(THD *thd, binlog_trx_data *trx_data, Log_event *end_ev) { - int error=0; DBUG_ENTER("binlog_end_trans"); + int error=0; + IO_CACHE *trans_log= &trx_data->trans_log; + if (end_ev) + { + thd->binlog_flush_pending_rows_event(true); error= mysql_bin_log.write(thd, trans_log, end_ev); + } + else + { + thd->binlog_delete_pending_rows_event(); + } + + /* + We need to step the table map version both after writing the + entire transaction to the log file and after rolling back the + transaction. + + We need to step the table map version after writing the + transaction cache to disk. In addition, we need to step the table + map version on a rollback to ensure that a new table map event is + generated instead of the one that was written to the thrown-away + transaction cache. 
+ */ + ++mysql_bin_log.m_table_map_version; statistic_increment(binlog_cache_use, &LOCK_status); if (trans_log->disk_writes != 0) @@ -130,32 +169,36 @@ static int binlog_prepare(THD *thd, bool all) static int binlog_commit(THD *thd, bool all) { - IO_CACHE *trans_log= (IO_CACHE*)thd->ha_data[binlog_hton.slot]; DBUG_ENTER("binlog_commit"); + binlog_trx_data *const trx_data= + (binlog_trx_data*) thd->ha_data[binlog_hton.slot]; + IO_CACHE *trans_log= &trx_data->trans_log; DBUG_ASSERT(mysql_bin_log.is_open() && (all || !(thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)))); - if (!my_b_tell(trans_log)) + if (trx_data->empty()) { // we're here because trans_log was flushed in MYSQL_LOG::log() DBUG_RETURN(0); } Query_log_event qev(thd, STRING_WITH_LEN("COMMIT"), TRUE, FALSE); - DBUG_RETURN(binlog_end_trans(thd, trans_log, &qev)); + DBUG_RETURN(binlog_end_trans(thd, trx_data, &qev)); } static int binlog_rollback(THD *thd, bool all) { - int error=0; - IO_CACHE *trans_log= (IO_CACHE*)thd->ha_data[binlog_hton.slot]; DBUG_ENTER("binlog_rollback"); + int error=0; + binlog_trx_data *const trx_data= + (binlog_trx_data*) thd->ha_data[binlog_hton.slot]; + IO_CACHE *trans_log= &trx_data->trans_log; /* First assert is guaranteed - see trans_register_ha() call below. The second must be true. If it is not, we're registering unnecessary, doing extra work. The cause should be found and eliminated */ DBUG_ASSERT(all || !(thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))); - DBUG_ASSERT(mysql_bin_log.is_open() && my_b_tell(trans_log)); + DBUG_ASSERT(mysql_bin_log.is_open() && !trx_data->empty()); /* Update the binary log with a BEGIN/ROLLBACK block if we have cached some queries and we updated some non-transactional @@ -165,10 +208,10 @@ static int binlog_rollback(THD *thd, bool all) if (unlikely(thd->options & OPTION_STATUS_NO_TRANS_UPDATE)) { Query_log_event qev(thd, STRING_WITH_LEN("ROLLBACK"), TRUE, FALSE); - error= binlog_end_trans(thd, trans_log, &qev); + error= binlog_end_trans(thd, trx_data, &qev); } else - error= binlog_end_trans(thd, trans_log, 0); + error= binlog_end_trans(thd, trx_data, 0); DBUG_RETURN(error); } @@ -195,8 +238,10 @@ static int binlog_rollback(THD *thd, bool all) static int binlog_savepoint_set(THD *thd, void *sv) { - IO_CACHE *trans_log= (IO_CACHE*)thd->ha_data[binlog_hton.slot]; DBUG_ENTER("binlog_savepoint_set"); + binlog_trx_data *const trx_data= + (binlog_trx_data*) thd->ha_data[binlog_hton.slot]; + IO_CACHE *trans_log= &trx_data->trans_log; DBUG_ASSERT(mysql_bin_log.is_open() && my_b_tell(trans_log)); *(my_off_t *)sv= my_b_tell(trans_log); @@ -207,8 +252,10 @@ static int binlog_savepoint_set(THD *thd, void *sv) static int binlog_savepoint_rollback(THD *thd, void *sv) { - IO_CACHE *trans_log= (IO_CACHE*)thd->ha_data[binlog_hton.slot]; DBUG_ENTER("binlog_savepoint_rollback"); + binlog_trx_data *const trx_data= + (binlog_trx_data*) thd->ha_data[binlog_hton.slot]; + IO_CACHE *trans_log= &trx_data->trans_log; DBUG_ASSERT(mysql_bin_log.is_open() && my_b_tell(trans_log)); /* @@ -367,6 +414,7 @@ MYSQL_LOG::MYSQL_LOG() :bytes_written(0), last_time(0), query_start(0), name(0), prepared_xids(0), log_type(LOG_CLOSED), file_id(1), open_count(1), write_error(FALSE), inited(FALSE), need_start_event(TRUE), + m_table_map_version(0), description_event_for_exec(0), description_event_for_queue(0) { /* @@ -1363,7 +1411,7 @@ void MYSQL_LOG::new_file(bool need_lock) to change base names at some point. 
*/ THD *thd = current_thd; /* may be 0 if we are reacting to SIGHUP */ - Rotate_log_event r(thd,new_name+dirname_length(new_name), + Rotate_log_event r(new_name+dirname_length(new_name), 0, LOG_EVENT_OFFSET, 0); r.write(&log_file); bytes_written += r.data_written; @@ -1589,6 +1637,162 @@ bool MYSQL_LOG::is_query_in_union(THD *thd, query_id_t query_id_param) query_id_param >= thd->binlog_evt_union.first_query_id); } + +/* + These functions are placed in this file since they need access to + binlog_hton, which has internal linkage. +*/ + +int THD::binlog_setup_trx_data() +{ + DBUG_ENTER("THD::binlog_setup_trx_data"); + binlog_trx_data *trx_data= + (binlog_trx_data*) ha_data[binlog_hton.slot]; + + if (trx_data) + DBUG_RETURN(0); // Already set up + + ha_data[binlog_hton.slot]= trx_data= + (binlog_trx_data*) my_malloc(sizeof(binlog_trx_data), MYF(MY_ZEROFILL)); + if (!trx_data || + open_cached_file(&trx_data->trans_log, mysql_tmpdir, + LOG_PREFIX, binlog_cache_size, MYF(MY_WME))) + { + my_free((gptr)trx_data, MYF(MY_ALLOW_ZERO_PTR)); + ha_data[binlog_hton.slot]= 0; + DBUG_RETURN(1); // Didn't manage to set it up + } + trx_data->trans_log.end_of_file= max_binlog_cache_size; + DBUG_RETURN(0); +} + +Rows_log_event* +THD::binlog_get_pending_rows_event() const +{ + binlog_trx_data *const trx_data= + (binlog_trx_data*) ha_data[binlog_hton.slot]; + /* + This is less than ideal, but here's the story: If there is no + trx_data, prepare_pending_rows_event() has never been called + (since the trx_data is set up there). In that case, we just return + NULL. + */ + return trx_data ? trx_data->pending : NULL; +} + +void +THD::binlog_set_pending_rows_event(Rows_log_event* ev) +{ + binlog_trx_data *const trx_data= + (binlog_trx_data*) ha_data[binlog_hton.slot]; + DBUG_ASSERT(trx_data); + trx_data->pending= ev; +} + + +/* + Moves the last bunch of rows from the pending Rows event to the binlog + (either cached binlog if transaction, or disk binlog). Sets a new pending + event. +*/ +int MYSQL_LOG::flush_and_set_pending_rows_event(THD *thd, Rows_log_event* event) +{ + DBUG_ENTER("MYSQL_LOG::flush_and_set_pending_rows_event(event)"); + DBUG_ASSERT(binlog_row_based && mysql_bin_log.is_open()); + DBUG_PRINT("enter", ("event=%p", event)); + + int error= 0; + + binlog_trx_data *const trx_data= + (binlog_trx_data*) thd->ha_data[binlog_hton.slot]; + + DBUG_ASSERT(trx_data); + + if (Rows_log_event* pending= trx_data->pending) + { + IO_CACHE *file= &log_file; + + /* + Decide if we should write to the log file directly or to the + transaction log. + */ + if (pending->get_cache_stmt() || my_b_tell(&trx_data->trans_log)) + file= &trx_data->trans_log; + + /* + If we are writing to the log file directly, we could avoid + locking the log. This does not work since we need to step the + m_table_map_version below, and that change has to be protected + by the LOCK_log mutex. + */ + pthread_mutex_lock(&LOCK_log); + + /* + Write a table map if necessary + */ + if (pending->maybe_write_table_map(thd, file, this)) + { + pthread_mutex_unlock(&LOCK_log); + DBUG_RETURN(2); + } + + /* + Write pending event to log file or transaction cache + */ + if (pending->write(file)) + { + pthread_mutex_unlock(&LOCK_log); + DBUG_RETURN(1); + } + + /* + We step the table map version if we are writing an event + representing the end of a statement. We do this regardless of + wheather we write to the transaction cache or to directly to the + file. 
+ + In an ideal world, we could avoid stepping the table map version + if we were writing to a transaction cache, since we could then + reuse the table map that was written earlier in the transaction + cache. This does not work since STMT_END_F implies closing all + table mappings on the slave side. + + TODO: Find a solution so that table maps does not have to be + written several times within a transaction. + */ + if (pending->get_flags(Rows_log_event::STMT_END_F)) + ++m_table_map_version; + + delete pending; + + if (file == &log_file) + { + error= flush_and_sync(); + if (!error) + { + signal_update(); + rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED); + } + } + + pthread_mutex_unlock(&LOCK_log); + } + else if (event && event->get_cache_stmt()) /* && pending == 0 */ + { + /* + If we are setting a non-null event for a table that is + transactional, we start a transaction here as well. + */ + trans_register_ha(thd, + thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN), + &binlog_hton); + } + + trx_data->pending= event; + + DBUG_RETURN(error); +} + /* Write an event to the binary log */ @@ -1609,7 +1813,29 @@ bool MYSQL_LOG::write(Log_event *event_info) thd->binlog_evt_union.unioned_events_trans |= event_info->cache_stmt; DBUG_RETURN(0); } - + + /* + Flush the pending rows event to the transaction cache or to the + log file. Since this function potentially aquire the LOCK_log + mutex, we do this before aquiring the LOCK_log mutex in this + function. + + This is not optimal, but necessary in the current implementation + since there is code that writes rows to system tables without + using some way to flush the pending event (e.g., binlog_query()). + + TODO: There shall be no writes to any system table after calling + binlog_query(), so these writes has to be moved to before the call + of binlog_query() for correct functioning. + + This is necessesary not only for RBR, but the master might crash + after binlogging the query but before changing the system tables. + This means that the slave and the master are not in the same state + (after the master has restarted), so therefore we have to + eliminate this problem. 
+ */ + thd->binlog_flush_pending_rows_event(true); + pthread_mutex_lock(&LOCK_log); /* @@ -1649,37 +1875,26 @@ bool MYSQL_LOG::write(Log_event *event_info) */ if (opt_using_transactions && thd) { - IO_CACHE *trans_log= (IO_CACHE*)thd->ha_data[binlog_hton.slot]; + if (thd->binlog_setup_trx_data()) + goto err; - if (event_info->get_cache_stmt()) - { - if (!trans_log) - { - thd->ha_data[binlog_hton.slot]= trans_log= (IO_CACHE *) - my_malloc(sizeof(IO_CACHE), MYF(MY_ZEROFILL)); - if (!trans_log || open_cached_file(trans_log, mysql_tmpdir, - LOG_PREFIX, - binlog_cache_size, MYF(MY_WME))) - { - my_free((gptr)trans_log, MYF(MY_ALLOW_ZERO_PTR)); - thd->ha_data[binlog_hton.slot]= trans_log= 0; - goto err; - } - trans_log->end_of_file= max_binlog_cache_size; - trans_register_ha(thd, - thd->options & (OPTION_NOT_AUTOCOMMIT | - OPTION_BEGIN), - &binlog_hton); - } - else if (!my_b_tell(trans_log)) - trans_register_ha(thd, - thd->options & (OPTION_NOT_AUTOCOMMIT | - OPTION_BEGIN), - &binlog_hton); - file= trans_log; - } - else if (trans_log && my_b_tell(trans_log)) + binlog_trx_data *const trx_data= + (binlog_trx_data*) thd->ha_data[binlog_hton.slot]; + IO_CACHE *trans_log= &trx_data->trans_log; + + if (event_info->get_cache_stmt() && !my_b_tell(trans_log)) + trans_register_ha(thd, + thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN), + &binlog_hton); + + if (event_info->get_cache_stmt() || my_b_tell(trans_log)) file= trans_log; + /* + Note: as Mats suggested, for all the cases above where we write to + trans_log, it sounds unnecessary to lock LOCK_log. We should rather + test first if we want to write to trans_log, and if not, lock + LOCK_log. TODO. + */ } #endif DBUG_PRINT("info",("event type=%d",event_info->get_type_code())); @@ -1694,42 +1909,49 @@ bool MYSQL_LOG::write(Log_event *event_info) of the SQL command */ + /* + If row-based binlogging, Insert_id, Rand and other kind of "setting + context" events are not needed. 
+ */ if (thd) { - if (thd->last_insert_id_used) + if (!binlog_row_based) { - Intvar_log_event e(thd,(uchar) LAST_INSERT_ID_EVENT, - thd->current_insert_id); - if (e.write(file)) - goto err; - } - if (thd->insert_id_used) - { - Intvar_log_event e(thd,(uchar) INSERT_ID_EVENT,thd->last_insert_id); - if (e.write(file)) - goto err; - } - if (thd->rand_used) - { - Rand_log_event e(thd,thd->rand_saved_seed1,thd->rand_saved_seed2); - if (e.write(file)) - goto err; - } - if (thd->user_var_events.elements) - { - for (uint i= 0; i < thd->user_var_events.elements; i++) - { - BINLOG_USER_VAR_EVENT *user_var_event; - get_dynamic(&thd->user_var_events,(gptr) &user_var_event, i); - User_var_log_event e(thd, user_var_event->user_var_event->name.str, - user_var_event->user_var_event->name.length, - user_var_event->value, - user_var_event->length, - user_var_event->type, - user_var_event->charset_number); - if (e.write(file)) - goto err; - } + if (thd->last_insert_id_used) + { + Intvar_log_event e(thd,(uchar) LAST_INSERT_ID_EVENT, + thd->current_insert_id); + if (e.write(file)) + goto err; + } + if (thd->insert_id_used) + { + Intvar_log_event e(thd,(uchar) INSERT_ID_EVENT,thd->last_insert_id); + if (e.write(file)) + goto err; + } + if (thd->rand_used) + { + Rand_log_event e(thd,thd->rand_saved_seed1,thd->rand_saved_seed2); + if (e.write(file)) + goto err; + } + if (thd->user_var_events.elements) + { + for (uint i= 0; i < thd->user_var_events.elements; i++) + { + BINLOG_USER_VAR_EVENT *user_var_event; + get_dynamic(&thd->user_var_events,(gptr) &user_var_event, i); + User_var_log_event e(thd, user_var_event->user_var_event->name.str, + user_var_event->user_var_event->name.length, + user_var_event->value, + user_var_event->length, + user_var_event->type, + user_var_event->charset_number); + if (e.write(file)) + goto err; + } + } } } @@ -1760,6 +1982,9 @@ err: } } + if (event_info->flags & LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F) + ++m_table_map_version; + pthread_mutex_unlock(&LOCK_log); DBUG_RETURN(error); } @@ -2307,6 +2532,44 @@ void MYSQL_LOG::signal_update() DBUG_VOID_RETURN; } +#ifndef MYSQL_CLIENT +bool MYSQL_LOG::write_table_map(THD *thd, IO_CACHE *file, TABLE* table, + bool is_transactional) +{ + DBUG_ENTER("MYSQL_LOG::write_table_map()"); + DBUG_PRINT("enter", ("table=%p (%s: %u)", + table, table->s->table_name, table->s->table_map_id)); + + /* Pre-conditions */ + DBUG_ASSERT(binlog_row_based && is_open()); + DBUG_ASSERT(table->s->table_map_id != ULONG_MAX); + +#ifndef DBUG_OFF + /* + We only need to execute under the LOCK_log mutex if we are writing + to the log file; otherwise, we are writing to a thread-specific + transaction cache and there is no need to serialize this event + with events in other threads. 
+ */ + if (file == &log_file) + safe_mutex_assert_owner(&LOCK_log); +#endif + + Table_map_log_event::flag_set const + flags= Table_map_log_event::NO_FLAGS; + + Table_map_log_event + the_event(thd, table, table->s->table_map_id, is_transactional, flags); + + if (the_event.write(file)) + DBUG_RETURN(1); + + table->s->table_map_version= m_table_map_version; + DBUG_RETURN(0); +} +#endif /* !defined(MYSQL_CLIENT) */ + + #ifdef __NT__ void print_buffer_to_nt_eventlog(enum loglevel level, char *buff, uint length, int buffLen) @@ -3013,9 +3276,11 @@ void TC_LOG_BINLOG::close() */ int TC_LOG_BINLOG::log(THD *thd, my_xid xid) { + DBUG_ENTER("TC_LOG_BINLOG::log"); Xid_log_event xle(thd, xid); - IO_CACHE *trans_log= (IO_CACHE*)thd->ha_data[binlog_hton.slot]; - return !binlog_end_trans(thd, trans_log, &xle); // invert return value + binlog_trx_data *trx_data= + (binlog_trx_data*) thd->ha_data[binlog_hton.slot]; + DBUG_RETURN(!binlog_end_trans(thd, trx_data, &xle)); // invert return value } void TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid) diff --git a/sql/log.h b/sql/log.h new file mode 100644 index 00000000000..ea2946c2d86 --- /dev/null +++ b/sql/log.h @@ -0,0 +1,332 @@ +/* Copyright (C) 2005 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef LOG_H +#define LOG_H + +struct st_relay_log_info; + +class Format_description_log_event; + +/* + Transaction Coordinator log - a base abstract class + for two different implementations +*/ +class TC_LOG +{ + public: + int using_heuristic_recover(); + TC_LOG() {} + virtual ~TC_LOG() {} + + virtual int open(const char *opt_name)=0; + virtual void close()=0; + virtual int log(THD *thd, my_xid xid)=0; + virtual void unlog(ulong cookie, my_xid xid)=0; +}; + +class TC_LOG_DUMMY: public TC_LOG // use it to disable the logging +{ + public: + int open(const char *opt_name) { return 0; } + void close() { } + int log(THD *thd, my_xid xid) { return 1; } + void unlog(ulong cookie, my_xid xid) { } +}; + +#ifdef HAVE_MMAP +class TC_LOG_MMAP: public TC_LOG +{ + public: // only to keep Sun Forte on sol9x86 happy + typedef enum { + POOL, // page is in pool + ERROR, // last sync failed + DIRTY // new xids added since last sync + } PAGE_STATE; + + private: + typedef struct st_page { + struct st_page *next; // page a linked in a fifo queue + my_xid *start, *end; // usable area of a page + my_xid *ptr; // next xid will be written here + int size, free; // max and current number of free xid slots on the page + int waiters; // number of waiters on condition + PAGE_STATE state; // see above + pthread_mutex_t lock; // to access page data or control structure + pthread_cond_t cond; // to wait for a sync + } PAGE; + + char logname[FN_REFLEN]; + File fd; + my_off_t file_length; + uint npages, inited; + uchar *data; + struct st_page *pages, *syncing, *active, *pool, *pool_last; + /* + note that, e.g. 
LOCK_active is only used to protect + 'active' pointer, to protect the content of the active page + one has to use active->lock. + Same for LOCK_pool and LOCK_sync + */ + pthread_mutex_t LOCK_active, LOCK_pool, LOCK_sync; + pthread_cond_t COND_pool, COND_active; + + public: + TC_LOG_MMAP(): inited(0) {} + int open(const char *opt_name); + void close(); + int log(THD *thd, my_xid xid); + void unlog(ulong cookie, my_xid xid); + int recover(); + + private: + void get_active_from_pool(); + int sync(); + int overflow(); +}; +#else +#define TC_LOG_MMAP TC_LOG_DUMMY +#endif + +extern TC_LOG *tc_log; +extern TC_LOG_MMAP tc_log_mmap; +extern TC_LOG_DUMMY tc_log_dummy; + +/* log info errors */ +#define LOG_INFO_EOF -1 +#define LOG_INFO_IO -2 +#define LOG_INFO_INVALID -3 +#define LOG_INFO_SEEK -4 +#define LOG_INFO_MEM -6 +#define LOG_INFO_FATAL -7 +#define LOG_INFO_IN_USE -8 + +/* bitmap to SQL_LOG::close() */ +#define LOG_CLOSE_INDEX 1 +#define LOG_CLOSE_TO_BE_OPENED 2 +#define LOG_CLOSE_STOP_EVENT 4 + +struct st_relay_log_info; + +typedef struct st_log_info +{ + char log_file_name[FN_REFLEN]; + my_off_t index_file_offset, index_file_start_offset; + my_off_t pos; + bool fatal; // if the purge happens to give us a negative offset + pthread_mutex_t lock; + st_log_info():fatal(0) { pthread_mutex_init(&lock, MY_MUTEX_INIT_FAST);} + ~st_log_info() { pthread_mutex_destroy(&lock);} +} LOG_INFO; + +class Log_event; +class Rows_log_event; + +enum enum_log_type { LOG_CLOSED, LOG_TO_BE_OPENED, LOG_NORMAL, LOG_NEW, LOG_BIN}; + +/* + TODO split MYSQL_LOG into base MYSQL_LOG and + MYSQL_QUERY_LOG, MYSQL_SLOW_LOG, MYSQL_BIN_LOG + most of the code from MYSQL_LOG should be in the MYSQL_BIN_LOG + only (TC_LOG included) + + TODO use mmap instead of IO_CACHE for binlog + (mmap+fsync is two times faster than write+fsync) +*/ + +class MYSQL_LOG: public TC_LOG +{ + private: + /* LOCK_log and LOCK_index are inited by init_pthread_objects() */ + pthread_mutex_t LOCK_log, LOCK_index; + pthread_mutex_t LOCK_prep_xids; + pthread_cond_t COND_prep_xids; + pthread_cond_t update_cond; + ulonglong bytes_written; + time_t last_time,query_start; + IO_CACHE log_file; + IO_CACHE index_file; + char *name; + char time_buff[20],db[NAME_LEN+1]; + char log_file_name[FN_REFLEN],index_file_name[FN_REFLEN]; + /* + The max size before rotation (usable only if log_type == LOG_BIN: binary + logs and relay logs). + For a binlog, max_size should be max_binlog_size. + For a relay log, it should be max_relay_log_size if this is non-zero, + max_binlog_size otherwise. + max_size is set in init(), and dynamically changed (when one does SET + GLOBAL MAX_BINLOG_SIZE|MAX_RELAY_LOG_SIZE) by fix_max_binlog_size and + fix_max_relay_log_size). + */ + ulong max_size; + ulong prepared_xids; /* for tc log - number of xids to remember */ + volatile enum_log_type log_type; + enum cache_type io_cache_type; + // current file sequence number for load data infile binary logging + uint file_id; + uint open_count; // For replication + int readers_count; + bool write_error, inited; + bool need_start_event; + /* + no_auto_events means we don't want any of these automatic events : + Start/Rotate/Stop. That is, in 4.x when we rotate a relay log, we don't + want a Rotate_log event to be written to the relay log. When we start a + relay log etc. So in 4.x this is 1 for relay logs, 0 for binlogs. + In 5.0 it's 0 for relay logs too! + */ + bool no_auto_events; + friend class Log_event; + +public: + ulonglong m_table_map_version; + + /* + These describe the log's format. 
This is used only for relay logs. + _for_exec is used by the SQL thread, _for_queue by the I/O thread. It's + necessary to have 2 distinct objects, because the I/O thread may be reading + events in a different format from what the SQL thread is reading (consider + the case of a master which has been upgraded from 5.0 to 5.1 without doing + RESET MASTER, or from 4.x to 5.0). + */ + Format_description_log_event *description_event_for_exec, + *description_event_for_queue; + + MYSQL_LOG(); + /* + note that there's no destructor ~MYSQL_LOG() ! + The reason is that we don't want it to be automatically called + on exit() - but only during the correct shutdown process + */ + + int open(const char *opt_name); + void close(); + int log(THD *thd, my_xid xid); + void unlog(ulong cookie, my_xid xid); + int recover(IO_CACHE *log, Format_description_log_event *fdle); +#if !defined(MYSQL_CLIENT) + bool is_table_mapped(TABLE *table) const + { + return table->s->table_map_version == m_table_map_version; + } + + int flush_and_set_pending_rows_event(THD *thd, Rows_log_event* event); + +#endif /* !defined(MYSQL_CLIENT) */ + void reset_bytes_written() + { + bytes_written = 0; + } + void harvest_bytes_written(ulonglong* counter) + { +#ifndef DBUG_OFF + char buf1[22],buf2[22]; +#endif + DBUG_ENTER("harvest_bytes_written"); + (*counter)+=bytes_written; + DBUG_PRINT("info",("counter: %s bytes_written: %s", llstr(*counter,buf1), + llstr(bytes_written,buf2))); + bytes_written=0; + DBUG_VOID_RETURN; + } + void set_max_size(ulong max_size_arg); + void signal_update(); + void wait_for_update(THD* thd, bool master_or_slave); + void set_need_start_event() { need_start_event = 1; } + void init(enum_log_type log_type_arg, + enum cache_type io_cache_type_arg, + bool no_auto_events_arg, ulong max_size); + void init_pthread_objects(); + void cleanup(); + bool open(const char *log_name, + enum_log_type log_type, + const char *new_name, + enum cache_type io_cache_type_arg, + bool no_auto_events_arg, ulong max_size, + bool null_created); + const char *generate_name(const char *log_name, const char *suffix, + bool strip_ext, char *buff); + /* simplified open_xxx wrappers for the gigantic open above */ + bool open_query_log(const char *log_name) + { + char buf[FN_REFLEN]; + return open(generate_name(log_name, ".log", 0, buf), + LOG_NORMAL, 0, WRITE_CACHE, 0, 0, 0); + } + bool open_slow_log(const char *log_name) + { + char buf[FN_REFLEN]; + return open(generate_name(log_name, "-slow.log", 0, buf), + LOG_NORMAL, 0, WRITE_CACHE, 0, 0, 0); + } + bool open_index_file(const char *index_file_name_arg, + const char *log_name); + void new_file(bool need_lock); + bool write(THD *thd, enum enum_server_command command, + const char *format,...); + bool write(THD *thd, const char *query, uint query_length, + time_t query_start=0); + bool write(Log_event* event_info); // binary log write + bool write(THD *thd, IO_CACHE *cache, Log_event *commit_event); + + bool write_table_map(THD *thd, IO_CACHE *cache, TABLE *table, bool is_trans); + + void start_union_events(THD *thd); + void stop_union_events(THD *thd); + bool is_query_in_union(THD *thd, query_id_t query_id_param); + + /* + v stands for vector + invoked as appendv(buf1,len1,buf2,len2,...,bufn,lenn,0) + */ + bool appendv(const char* buf,uint len,...); + bool append(Log_event* ev); + + int generate_new_name(char *new_name,const char *old_name); + void make_log_name(char* buf, const char* log_ident); + bool is_active(const char* log_file_name); + int update_log_index(LOG_INFO* linfo, bool 
need_update_threads); + void rotate_and_purge(uint flags); + bool flush_and_sync(); + int purge_logs(const char *to_log, bool included, + bool need_mutex, bool need_update_threads, + ulonglong *decrease_log_space); + int purge_logs_before_date(time_t purge_time); + int purge_first_log(struct st_relay_log_info* rli, bool included); + bool reset_logs(THD* thd); + void close(uint exiting); + + // iterating through the log index file + int find_log_pos(LOG_INFO* linfo, const char* log_name, + bool need_mutex); + int find_next_log(LOG_INFO* linfo, bool need_mutex); + int get_current_log(LOG_INFO* linfo); + uint next_file_id(); + inline bool is_open() { return log_type != LOG_CLOSED; } + inline char* get_index_fname() { return index_file_name;} + inline char* get_log_fname() { return log_file_name; } + inline char* get_name() { return name; } + inline pthread_mutex_t* get_log_lock() { return &LOCK_log; } + inline IO_CACHE* get_log_file() { return &log_file; } + + inline void lock_index() { pthread_mutex_lock(&LOCK_index);} + inline void unlock_index() { pthread_mutex_unlock(&LOCK_index);} + inline IO_CACHE *get_index_file() { return &index_file;} + inline uint32 get_open_count() { return open_count; } +}; + +#endif /* LOG_H */ diff --git a/sql/log_event.cc b/sql/log_event.cc index c8f8ff40700..6e256a0c295 100644 --- a/sql/log_event.cc +++ b/sql/log_event.cc @@ -21,11 +21,14 @@ #pragma implementation // gcc: Class implementation #endif -#include "mysql_priv.h" +#include "mysql_priv.h" #include "slave.h" #include "rpl_filter.h" #include <my_dir.h> #endif /* MYSQL_CLIENT */ +#include <base64.h> +#include <my_bitmap.h> +#include <my_vle.h> #define log_cs &my_charset_latin1 @@ -232,6 +235,7 @@ char *str_to_hex(char *to, const char *from, uint len) commands just before it prints a query. 
*/ +#ifdef MYSQL_CLIENT static void print_set_option(FILE* file, uint32 bits_changed, uint32 option, uint32 flags, const char* name, bool* need_comma) { @@ -243,6 +247,7 @@ static void print_set_option(FILE* file, uint32 bits_changed, uint32 option, *need_comma= 1; } } +#endif /************************************************************************** Log_event methods (= the parent class of all events) @@ -271,6 +276,10 @@ const char* Log_event::get_type_str() case XID_EVENT: return "Xid"; case USER_VAR_EVENT: return "User var"; case FORMAT_DESCRIPTION_EVENT: return "Format_desc"; + case TABLE_MAP_EVENT: return "Table_map"; + case WRITE_ROWS_EVENT: return "Write_rows"; + case UPDATE_ROWS_EVENT: return "Update_rows"; + case DELETE_ROWS_EVENT: return "Delete_rows"; case BEGIN_LOAD_QUERY_EVENT: return "Begin_load_query"; case EXECUTE_LOAD_QUERY_EVENT: return "Execute_load_query"; default: return "Unknown"; /* impossible */ @@ -778,6 +787,9 @@ Log_event* Log_event::read_log_event(const char* buf, uint event_len, DBUG_RETURN(NULL); // general sanity check - will fail on a partial read } + /* To check the integrity of the Log_event_type enumeration */ + DBUG_ASSERT(buf[EVENT_TYPE_OFFSET] < ENUM_END_EVENT); + switch(buf[EVENT_TYPE_OFFSET]) { case QUERY_EVENT: ev = new Query_log_event(buf, event_len, description_event, QUERY_EVENT); @@ -829,6 +841,20 @@ Log_event* Log_event::read_log_event(const char* buf, uint event_len, case FORMAT_DESCRIPTION_EVENT: ev = new Format_description_log_event(buf, event_len, description_event); break; +#if defined(HAVE_REPLICATION) && defined(HAVE_ROW_BASED_REPLICATION) + case WRITE_ROWS_EVENT: + ev = new Write_rows_log_event(buf, event_len, description_event); + break; + case UPDATE_ROWS_EVENT: + ev = new Update_rows_log_event(buf, event_len, description_event); + break; + case DELETE_ROWS_EVENT: + ev = new Delete_rows_log_event(buf, event_len, description_event); + break; + case TABLE_MAP_EVENT: + ev = new Table_map_log_event(buf, event_len, description_event); + break; +#endif case BEGIN_LOAD_QUERY_EVENT: ev = new Begin_load_query_log_event(buf, event_len, description_event); break; @@ -952,6 +978,24 @@ void Log_event::print_header(FILE* file, PRINT_EVENT_INFO* print_event_info) } +void Log_event::print_base64(FILE* file, PRINT_EVENT_INFO* print_event_info) +{ + uchar *ptr= (uchar*)temp_buf; + my_off_t size= uint4korr(ptr + EVENT_LEN_OFFSET); + + char *tmp_str= + (char *) my_malloc(base64_needed_encoded_length(size), MYF(MY_WME)); + if (!tmp_str) { + fprintf(stderr, "\nError: Out of memory. " + "Could not print correct binlog event.\n"); + return; + } + int res= base64_encode(ptr, size, tmp_str); + fprintf(file, "\nBINLOG '\n%s\n';\n", tmp_str); + my_free(tmp_str, MYF(0)); +} + + /* Log_event::print_timestamp() */ @@ -1714,7 +1758,7 @@ int Query_log_event::exec_event(struct st_relay_log_info* rli, const char *query clear_all_errors(thd, rli); /* Can ignore query */ else { - slave_print_error(rli,expected_error, + slave_print_msg(ERROR_LEVEL, rli, expected_error, "\ Query partially completed on the master (error on master: %d) \ and was aborted. There is a chance that your master is inconsistent at this \ @@ -1743,16 +1787,16 @@ compare_errors: !ignored_error_code(actual_error) && !ignored_error_code(expected_error)) { - slave_print_error(rli, 0, - "\ -Query caused different errors on master and slave. \ + slave_print_msg(ERROR_LEVEL, rli, 0, + "\ +Query caused different errors on master and slave. \ Error on master: '%s' (%d), Error on slave: '%s' (%d). 
\ Default database: '%s'. Query: '%s'", - ER_SAFE(expected_error), - expected_error, - actual_error ? thd->net.last_error: "no error", - actual_error, - print_slave_db_safe(db), query_arg); + ER_SAFE(expected_error), + expected_error, + actual_error ? thd->net.last_error: "no error", + actual_error, + print_slave_db_safe(db), query_arg); thd->query_error= 1; } /* @@ -1769,11 +1813,11 @@ Default database: '%s'. Query: '%s'", */ else if (thd->query_error || thd->is_fatal_error) { - slave_print_error(rli,actual_error, - "Error '%s' on query. Default database: '%s'. Query: '%s'", - (actual_error ? thd->net.last_error : - "unexpected success or fatal error"), - print_slave_db_safe(thd->db), query_arg); + slave_print_msg(ERROR_LEVEL, rli, actual_error, + "Error '%s' on query. Default database: '%s'. Query: '%s'", + (actual_error ? thd->net.last_error : + "unexpected success or fatal error"), + print_slave_db_safe(thd->db), query_arg); thd->query_error= 1; } @@ -2055,6 +2099,25 @@ Format_description_log_event(uint8 binlog_ver, const char* server_ver) post_header_len[DELETE_FILE_EVENT-1]= DELETE_FILE_HEADER_LEN; post_header_len[NEW_LOAD_EVENT-1]= post_header_len[LOAD_EVENT-1]; post_header_len[FORMAT_DESCRIPTION_EVENT-1]= FORMAT_DESCRIPTION_HEADER_LEN; + post_header_len[TABLE_MAP_EVENT-1]= TABLE_MAP_HEADER_LEN; + post_header_len[WRITE_ROWS_EVENT-1]= ROWS_HEADER_LEN; + post_header_len[UPDATE_ROWS_EVENT-1]= ROWS_HEADER_LEN; + post_header_len[DELETE_ROWS_EVENT-1]= ROWS_HEADER_LEN; + /* + We here have the possibility to simulate a master of before we changed + the table map id to be stored in 6 bytes: when it was stored in 4 + bytes (=> post_header_len was 6). This is used to test backward + compatibility. + This code can be removed after a few months (today is Dec 21st 2005), + when we know that the 4-byte masters are not deployed anymore (check + with Tomas Ulin first!), and the accompanying test (rpl_row_4_bytes) + too. + */ + DBUG_EXECUTE_IF("old_row_based_repl_4_byte_map_id_master", + post_header_len[TABLE_MAP_EVENT-1]= + post_header_len[WRITE_ROWS_EVENT-1]= + post_header_len[UPDATE_ROWS_EVENT-1]= + post_header_len[DELETE_ROWS_EVENT-1]= 6;); post_header_len[BEGIN_LOAD_QUERY_EVENT-1]= post_header_len[APPEND_BLOCK_EVENT-1]; post_header_len[EXECUTE_LOAD_QUERY_EVENT-1]= EXECUTE_LOAD_QUERY_HEADER_LEN; } @@ -2189,10 +2252,8 @@ int Format_description_log_event::exec_event(struct st_relay_log_info* rli) As a transaction NEVER spans on 2 or more binlogs: if we have an active transaction at this point, the master died while writing the transaction to the binary log, i.e. while - flushing the binlog cache to the binlog. As the write was started, - the transaction had been committed on the master, so we lack of - information to replay this transaction on the slave; all we can do - is stop with error. + flushing the binlog cache to the binlog. XA guarantees that master has + rolled back. So we roll back. Note: this event could be sent by the master to inform us of the format of its binlog; in other words maybe it is not at its original place when it comes to us; we'll know this by checking @@ -2200,11 +2261,13 @@ int Format_description_log_event::exec_event(struct st_relay_log_info* rli) */ if (!artificial_event && created && thd->transaction.all.nht) { - slave_print_error(rli, 0, "Rolling back unfinished transaction (no " - "COMMIT or ROLLBACK) from relay log. 
A probable cause " - "is that the master died while writing the transaction " - "to its binary log."); - end_trans(thd, ROLLBACK); + /* This is not an error (XA is safe), just an information */ + slave_print_msg(INFORMATION_LEVEL, rli, 0, + "Rolling back unfinished transaction (no COMMIT " + "or ROLLBACK in relay log). A probable cause is that " + "the master died while writing the transaction to " + "its binary log, thus rolled back too."); + rli->cleanup_context(thd, 1); } #endif /* @@ -2751,6 +2814,9 @@ int Load_log_event::exec_event(NET* net, struct st_relay_log_info* rli, thd->query_length= 0; // Should not be needed thd->query_error= 0; clear_all_errors(thd, rli); + + /* see Query_log_event::exec_event() and BUG#13360 */ + DBUG_ASSERT(!rli->m_table_map.count()); /* Usually mysql_init_query() is called by mysql_parse(), but we need it here as the present method does not call mysql_parse(). @@ -2962,9 +3028,9 @@ error: sql_errno=ER_UNKNOWN_ERROR; err=ER(sql_errno); } - slave_print_error(rli,sql_errno,"\ + slave_print_msg(ERROR_LEVEL, rli, sql_errno,"\ Error '%s' running LOAD DATA INFILE on table '%s'. Default database: '%s'", - err, (char*)table_name, print_slave_db_safe(save_db)); + err, (char*)table_name, print_slave_db_safe(save_db)); free_root(thd->mem_root,MYF(MY_KEEP_PREALLOC)); return 1; } @@ -2972,9 +3038,9 @@ Error '%s' running LOAD DATA INFILE on table '%s'. Default database: '%s'", if (thd->is_fatal_error) { - slave_print_error(rli,ER_UNKNOWN_ERROR, "\ + slave_print_msg(ERROR_LEVEL, rli, ER_UNKNOWN_ERROR, "\ Fatal error running LOAD DATA INFILE on table '%s'. Default database: '%s'", - (char*)table_name, print_slave_db_safe(save_db)); + (char*)table_name, print_slave_db_safe(save_db)); return 1; } @@ -3035,8 +3101,7 @@ void Rotate_log_event::print(FILE* file, PRINT_EVENT_INFO* print_event_info) #ifndef MYSQL_CLIENT -Rotate_log_event::Rotate_log_event(THD* thd_arg, - const char* new_log_ident_arg, +Rotate_log_event::Rotate_log_event(const char* new_log_ident_arg, uint ident_len_arg, ulonglong pos_arg, uint flags_arg) :Log_event(), new_log_ident(new_log_ident_arg), @@ -3045,7 +3110,7 @@ Rotate_log_event::Rotate_log_event(THD* thd_arg, { #ifndef DBUG_OFF char buff[22]; - DBUG_ENTER("Rotate_log_event::Rotate_log_event(THD*,...)"); + DBUG_ENTER("Rotate_log_event::Rotate_log_event(...,flags)"); DBUG_PRINT("enter",("new_log_ident %s pos %s flags %lu", new_log_ident_arg, llstr(pos_arg, buff), flags)); #endif @@ -3353,12 +3418,24 @@ int Rand_log_event::exec_event(struct st_relay_log_info* rli) Xid_log_event methods **************************************************************************/ +#if !defined(DBUG_OFF) && !defined(MYSQL_CLIENT) +/* + This static class member could be removed when mysqltest is made to support + a --replace-regex command: then tests which have XIDs in their output can + use this command to suppress non-deterministic XID values. 
+*/ +my_bool Xid_log_event::show_xid; +#endif + #if defined(HAVE_REPLICATION) && !defined(MYSQL_CLIENT) void Xid_log_event::pack_info(Protocol *protocol) { char buf[128], *pos; pos= strmov(buf, "COMMIT /* xid="); - pos= longlong10_to_str(xid, pos, 10); +#if !defined(DBUG_OFF) && !defined(MYSQL_CLIENT) + if (show_xid) +#endif + pos= longlong10_to_str(xid, pos, 10); pos= strmov(pos, " */"); protocol->store(buf, (uint) (pos-buf), &my_charset_bin); } @@ -4179,7 +4256,8 @@ int Create_file_log_event::exec_event(struct st_relay_log_info* rli) init_io_cache(&file, fd, IO_SIZE, WRITE_CACHE, (my_off_t)0, 0, MYF(MY_WME|MY_NABP))) { - slave_print_error(rli,my_errno, "Error in Create_file event: could not open file '%s'", fname_buf); + slave_print_msg(ERROR_LEVEL, rli, my_errno, "Error in Create_file event: " + "could not open file '%s'", fname_buf); goto err; } @@ -4190,9 +4268,9 @@ int Create_file_log_event::exec_event(struct st_relay_log_info* rli) if (write_base(&file)) { strmov(p, ".info"); // to have it right in the error message - slave_print_error(rli,my_errno, - "Error in Create_file event: could not write to file '%s'", - fname_buf); + slave_print_msg(ERROR_LEVEL, rli, my_errno, + "Error in Create_file event: could not write to file '%s'", + fname_buf); goto err; } end_io_cache(&file); @@ -4204,12 +4282,14 @@ int Create_file_log_event::exec_event(struct st_relay_log_info* rli) O_WRONLY | O_BINARY | O_EXCL | O_NOFOLLOW, MYF(MY_WME))) < 0) { - slave_print_error(rli,my_errno, "Error in Create_file event: could not open file '%s'", fname_buf); + slave_print_msg(ERROR_LEVEL, rli, my_errno, "Error in Create_file event: " + "could not open file '%s'", fname_buf); goto err; } if (my_write(fd, (byte*) block, block_len, MYF(MY_WME+MY_NABP))) { - slave_print_error(rli,my_errno, "Error in Create_file event: write to '%s' failed", fname_buf); + slave_print_msg(ERROR_LEVEL, rli, my_errno, "Error in Create_file event: " + "write to '%s' failed", fname_buf); goto err; } error=0; // Everything is ok @@ -4348,25 +4428,25 @@ int Append_block_log_event::exec_event(struct st_relay_log_info* rli) O_WRONLY | O_BINARY | O_EXCL | O_NOFOLLOW, MYF(MY_WME))) < 0) { - slave_print_error(rli, my_errno, - "Error in %s event: could not create file '%s'", - get_type_str(), fname); + slave_print_msg(ERROR_LEVEL, rli, my_errno, + "Error in %s event: could not create file '%s'", + get_type_str(), fname); goto err; } } else if ((fd = my_open(fname, O_WRONLY | O_APPEND | O_BINARY | O_NOFOLLOW, MYF(MY_WME))) < 0) { - slave_print_error(rli, my_errno, - "Error in %s event: could not open file '%s'", - get_type_str(), fname); + slave_print_msg(ERROR_LEVEL, rli, my_errno, + "Error in %s event: could not open file '%s'", + get_type_str(), fname); goto err; } if (my_write(fd, (byte*) block, block_len, MYF(MY_WME+MY_NABP))) { - slave_print_error(rli, my_errno, - "Error in %s event: write to '%s' failed", - get_type_str(), fname); + slave_print_msg(ERROR_LEVEL, rli, my_errno, + "Error in %s event: write to '%s' failed", + get_type_str(), fname); goto err; } error=0; @@ -4573,7 +4653,8 @@ int Execute_load_log_event::exec_event(struct st_relay_log_info* rli) init_io_cache(&file, fd, IO_SIZE, READ_CACHE, (my_off_t)0, 0, MYF(MY_WME|MY_NABP))) { - slave_print_error(rli,my_errno, "Error in Exec_load event: could not open file '%s'", fname); + slave_print_msg(ERROR_LEVEL, rli, my_errno, "Error in Exec_load event: " + "could not open file '%s'", fname); goto err; } if (!(lev = (Load_log_event*)Log_event::read_log_event(&file, @@ -4581,7 +4662,8 @@ 
int Execute_load_log_event::exec_event(struct st_relay_log_info* rli) rli->relay_log.description_event_for_exec)) || lev->get_type_code() != NEW_LOAD_EVENT) { - slave_print_error(rli,0, "Error in Exec_load event: file '%s' appears corrupted", fname); + slave_print_msg(ERROR_LEVEL, rli, 0, "Error in Exec_load event: " + "file '%s' appears corrupted", fname); goto err; } @@ -4607,10 +4689,10 @@ int Execute_load_log_event::exec_event(struct st_relay_log_info* rli) char *tmp= my_strdup(rli->last_slave_error,MYF(MY_WME)); if (tmp) { - slave_print_error(rli, - rli->last_slave_errno, /* ok to re-use error code */ - "%s. Failed executing load from '%s'", - tmp, fname); + slave_print_msg(ERROR_LEVEL, rli, + rli->last_slave_errno, /* ok to re-use error code */ + "%s. Failed executing load from '%s'", + tmp, fname); my_free(tmp,MYF(0)); } goto err; @@ -4816,7 +4898,7 @@ Execute_load_query_log_event::exec_event(struct st_relay_log_info* rli) if (!(buf = my_malloc(q_len + 1 - (fn_pos_end - fn_pos_start) + (FN_REFLEN + 10) + 10 + 8 + 5, MYF(MY_WME)))) { - slave_print_error(rli, my_errno, "Not enough memory"); + slave_print_msg(ERROR_LEVEL, rli, my_errno, "Not enough memory"); return 1; } @@ -4942,3 +5024,1727 @@ char* sql_ex_info::init(char* buf,char* buf_end,bool use_new_format) } return buf; } + + +#ifdef HAVE_ROW_BASED_REPLICATION + +/************************************************************************** + Rows_log_event member functions +**************************************************************************/ + +#ifndef MYSQL_CLIENT +Rows_log_event::Rows_log_event(THD *thd_arg, TABLE *tbl_arg, ulong tid, + MY_BITMAP const *cols, bool is_transactional) + : Log_event(thd_arg, 0, is_transactional), + m_table(tbl_arg), + m_table_id(tid), + m_width(tbl_arg->s->fields), + m_rows_buf(my_malloc(opt_binlog_rows_event_max_size * sizeof(*m_rows_buf), MYF(MY_WME))), + m_rows_cur(m_rows_buf), + m_rows_end(m_rows_buf + opt_binlog_rows_event_max_size), + m_flags(0) +{ + DBUG_ASSERT(m_table && m_table->s); + DBUG_ASSERT(m_table_id != ULONG_MAX); + + if (thd_arg->options & OPTION_NO_FOREIGN_KEY_CHECKS) + set_flags(NO_FOREIGN_KEY_CHECKS_F); + if (thd_arg->options & OPTION_RELAXED_UNIQUE_CHECKS) + set_flags(RELAXED_UNIQUE_CHECKS_F); + /* if bitmap_init fails, catched in is_valid() */ + if (likely(!bitmap_init(&m_cols, + m_width <= sizeof(m_bitbuf)*8 ? m_bitbuf : NULL, + (m_width + 7) & ~7UL, + false))) + memcpy(m_cols.bitmap, cols->bitmap, no_bytes_in_map(cols)); + else + m_cols.bitmap= 0; // to not free it +} +#endif + +Rows_log_event::Rows_log_event(const char *buf, uint event_len, + Log_event_type event_type, + const Format_description_log_event + *description_event) + : Log_event(buf, description_event), + m_rows_buf(0), m_rows_cur(0), m_rows_end(0) +{ + DBUG_ENTER("Rows_log_event::Rows_log_event(const char*,...)"); + uint8 const common_header_len= description_event->common_header_len; + uint8 const post_header_len= description_event->post_header_len[event_type-1]; + + DBUG_PRINT("enter",("event_len=%ld, common_header_len=%d, " + "post_header_len=%d", + event_len, common_header_len, + post_header_len)); + + const char *post_start= buf + common_header_len; + post_start+= RW_MAPID_OFFSET; + if (post_header_len == 6) + { + /* Master is of an intermediate source tree before 5.1.4. 
Id is 4 bytes */ + m_table_id= uint4korr(post_start); + post_start+= 4; + } + else + { + m_table_id= uint6korr(post_start); + post_start+= RW_FLAGS_OFFSET; + } + + DBUG_ASSERT(m_table_id != ULONG_MAX); + + m_flags= uint2korr(post_start); + + byte const *const var_start= buf + common_header_len + post_header_len; + byte const *const ptr_width= var_start; + byte const *const ptr_after_width= my_vle_decode(&m_width, ptr_width); + + const uint byte_count= (m_width + 7) / 8; + const char* const ptr_rows_data= var_start + byte_count + 1; + + my_size_t const data_size= event_len - (ptr_rows_data - buf); + DBUG_PRINT("info",("m_table_id=%lu, m_flags=%d, m_width=%u, data_size=%lu", + m_table_id, m_flags, m_width, data_size)); + + m_rows_buf= my_malloc(data_size, MYF(MY_WME)); + if (likely((bool)m_rows_buf)) + { + /* if bitmap_init fails, catched in is_valid() */ + if (likely(!bitmap_init(&m_cols, + m_width <= sizeof(m_bitbuf)*8 ? m_bitbuf : NULL, + (m_width + 7) & ~7UL, + false))) + memcpy(m_cols.bitmap, ptr_after_width, byte_count); + m_rows_end= m_rows_buf + data_size; + m_rows_cur= m_rows_end; + memcpy(m_rows_buf, ptr_rows_data, data_size); + } + else + m_cols.bitmap= 0; // to not free it + + DBUG_VOID_RETURN; +} + +Rows_log_event::~Rows_log_event() +{ + if (m_cols.bitmap == m_bitbuf) // no my_malloc happened + m_cols.bitmap= 0; // so no my_free in bitmap_free + bitmap_free(&m_cols); // To pair with bitmap_init(). + my_free(m_rows_buf, MYF(MY_ALLOW_ZERO_PTR)); +} + +#ifndef MYSQL_CLIENT +int Rows_log_event::do_add_row_data(byte *const row_data, + my_size_t const length) +{ + /* + When the table has a primary key, we would probably want, by default, to + log only the primary key value instead of the entire "before image". This + would save binlog space. TODO + */ + DBUG_ENTER("Rows_log_event::do_add_row_data(byte *data, my_size_t length)"); + DBUG_PRINT("enter", ("row_data= %p, length= %lu", row_data, length)); + DBUG_DUMP("row_data", row_data, min(length, 32)); + + DBUG_ASSERT(m_rows_buf <= m_rows_cur); + DBUG_ASSERT(m_rows_buf < m_rows_end); + DBUG_ASSERT(m_rows_cur <= m_rows_end); + + /* The cast will always work since m_rows_cur <= m_rows_end */ + if (static_cast<my_size_t>(m_rows_end - m_rows_cur) < length) + { + my_size_t const block_size= 1024; + my_ptrdiff_t const old_alloc= m_rows_end - m_rows_buf; + my_ptrdiff_t const new_alloc= + old_alloc + block_size * (length / block_size + block_size - 1); + my_ptrdiff_t const cur_size= m_rows_cur - m_rows_buf; + + byte* const new_buf= my_realloc(m_rows_buf, new_alloc, MYF(MY_WME)); + if (unlikely(!new_buf)) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + + /* If the memory moved, we need to move the pointers */ + if (new_buf != m_rows_buf) + { + m_rows_buf= new_buf; + m_rows_cur= m_rows_buf + cur_size; + } + + /* + The end pointer should always be changed to point to the end of + the allocated memory. + */ + m_rows_end= m_rows_buf + new_alloc; + } + + DBUG_ASSERT(m_rows_cur + length < m_rows_end); + memcpy(m_rows_cur, row_data, length); + m_rows_cur+= length; + DBUG_RETURN(0); +} +#endif + +#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) +/* + Unpack a row into a record. The row is assumed to only consist of the fields + for which the bitset represented by 'arr' and 'bits'; the other parts of the + record are left alone. 
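For orientation, the layout of one row image as consumed by unpack_row() below can be summarized like this (a reader's sketch derived from the function body, not separate format documentation):

  one row image:
    [ table->s->null_bytes bytes ]    copied verbatim into the record's null-bit block
    [ packed value, column i     ]    one entry per column i whose bit is set in 'cols',
    [ packed value, column j     ]    in field order, each decoded by Field::unpack()
    [ ...                        ]

Columns whose bits are clear in 'cols' consume no bytes at all; their bits are simply cleared in the handler's write_set so later code knows they carry no value.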
+ */ +static char const *unpack_row(TABLE *table, + char *record, char const *row, + MY_BITMAP const *cols) +{ + DBUG_ASSERT(record && row); + + MY_BITMAP *write_set= table->file->write_set; + my_size_t const n_null_bytes= table->s->null_bytes; + my_ptrdiff_t const offset= record - (byte*) table->record[0]; + + memcpy(record, row, n_null_bytes); + char const *ptr= row + n_null_bytes; + + bitmap_set_all(write_set); + Field **const begin_ptr = table->field; + for (Field **field_ptr= begin_ptr ; *field_ptr ; ++field_ptr) + { + Field *const f= *field_ptr; + + if (bitmap_is_set(cols, field_ptr - begin_ptr)) + { + /* Field...::unpack() cannot return 0 */ + ptr= f->unpack(f->ptr + offset, ptr); + } + else + bitmap_clear_bit(write_set, (field_ptr - begin_ptr) + 1); + } + return ptr; +} + +int Rows_log_event::exec_event(st_relay_log_info *rli) +{ + DBUG_ENTER("Rows_log_event::exec_event(st_relay_log_info*)"); + DBUG_ASSERT(m_table_id != ULONG_MAX); + int error= 0; + char const *row_start= m_rows_buf; + TABLE* table= rli->m_table_map.get_table(m_table_id); + + /* + 'thd' has been set by exec_relay_log_event(), just before calling + exec_event(). We still check here to prevent future coding errors. + */ + DBUG_ASSERT(rli->sql_thd == thd); + + /* + lock_tables() reads the contents of thd->lex, so they must be + initialized, so we should call lex_start(); to be even safer, we call + mysql_init_query() which does a more complete set of inits. + */ + mysql_init_query(thd, NULL, 0); + + if (table) + { + /* + table == NULL means that this table should not be + replicated (this was set up by Table_map_log_event::exec_event() which + tested replicate-* rules). + */ + TABLE_LIST table_list; + bool need_reopen; + uint count= 1; + bzero(&table_list, sizeof(table_list)); + table_list.lock_type= TL_WRITE; + table_list.next_global= table_list.next_local= 0; + table_list.table= table; + + for ( ; ; ) + { + table_list.db= const_cast<char*>(table->s->db.str); + table_list.alias= table_list.table_name= + const_cast<char*>(table->s->table_name.str); + + if ((error= lock_tables(thd, &table_list, count, &need_reopen)) == 0) + break; + if (!need_reopen) + { + slave_print_msg(ERROR_LEVEL, rli, error, + "Error in %s event: error during table %s.%s lock", + get_type_str(), table->s->db, table->s->table_name); + DBUG_RETURN(error); + } + /* + we need to store a local copy of the table names since the table object + will become invalid after close_tables_for_reopen + */ + char *db= my_strdup(table->s->db.str, MYF(MY_WME)); + char *table_name= my_strdup(table->s->table_name.str, MYF(MY_WME)); + + if (db == 0 || table_name == 0) + { + /* + Since the lock_tables() failed, the table is not locked, so + we don't need to unlock them. + */ + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + } + + /* + We also needs to flush the pending RBR event, since it keeps a + pointer to an open table. + + ALTERNATIVE SOLUTION: Extract a pointer to the pending RBR + event and reset the table pointer after the tables has been + reopened. 
+ */ + thd->binlog_flush_pending_rows_event(false); + + close_tables_for_reopen(thd, &table_list); + + /* open the table again, same as in Table_map_event::exec_event */ + table_list.db= const_cast<char*>(db); + table_list.alias= table_list.table_name= const_cast<char*>(table_name); + table_list.updating= 1; + TABLE_LIST *tables= &table_list; + if ((error= open_tables(thd, &tables, &count, 0)) == 0) + { + /* reset some variables for the table list*/ + table_list.updating= 0; + /* retrieve the new table reference and update the table map */ + table= table_list.table; + error= rli->m_table_map.set_table(m_table_id, table); + } + else /* error in open_tables */ + { + if (thd->query_error || thd->is_fatal_error) + { + /* + Error reporting borrowed from Query_log_event with many excessive + simplifications (we don't honour --slave-skip-errors) + */ + uint actual_error= thd->net.last_errno; + slave_print_msg(ERROR_LEVEL, rli, actual_error, + "Error '%s' on reopening table `%s`.`%s`", + (actual_error ? thd->net.last_error : + "unexpected success or fatal error"), + db, table_name); + thd->query_error= 1; + } + } + my_free((char*) db, MYF(MY_ALLOW_ZERO_PTR)); + my_free((char*) table_name, MYF(MY_ALLOW_ZERO_PTR)); + + if (error) + DBUG_RETURN(error); + } + + /* + It's not needed to set_time() but + 1) it continues the property that "Time" in SHOW PROCESSLIST shows how + much slave is behind + 2) it will be needed when we allow replication from a table with no + TIMESTAMP column to a table with one. + So we call set_time(), like in SBR. Presently it changes nothing. + */ + thd->set_time((time_t)when); + /* + There are a few flags that are replicated with each row event. + Make sure to set/clear them before executing the main body of + the event. + */ + if (get_flags(NO_FOREIGN_KEY_CHECKS_F)) + thd->options|= OPTION_NO_FOREIGN_KEY_CHECKS; + else + thd->options&= ~OPTION_NO_FOREIGN_KEY_CHECKS; + + if (get_flags(RELAXED_UNIQUE_CHECKS_F)) + thd->options|= OPTION_RELAXED_UNIQUE_CHECKS; + else + thd->options&= ~OPTION_RELAXED_UNIQUE_CHECKS; + /* A small test to verify that objects have consistent types */ + DBUG_ASSERT(sizeof(thd->options) == sizeof(OPTION_RELAXED_UNIQUE_CHECKS)); + + error= do_before_row_operations(table); + while (error == 0 && row_start < m_rows_end) { + char const *row_end= do_prepare_row(thd, table, row_start); + DBUG_ASSERT(row_end != NULL); // cannot happen + DBUG_ASSERT(row_end <= m_rows_end); + + /* in_use can have been set to NULL in close_tables_for_reopen */ + THD* old_thd= table->in_use; + if (!table->in_use) + table->in_use= thd; + error= do_exec_row(table); + table->in_use = old_thd; + switch (error) + { + /* Some recoverable errors */ + case HA_ERR_RECORD_CHANGED: + case HA_ERR_KEY_NOT_FOUND: /* Idempotency support: OK if + tuple does not exist */ + error= 0; + case 0: + break; + + default: + slave_print_msg(ERROR_LEVEL, rli, error, + "Error in %s event: row application failed", + get_type_str()); + thd->query_error= 1; + break; + } + + row_start= row_end; + } + DBUG_EXECUTE_IF("STOP_SLAVE_after_first_Rows_event", + rli->abort_slave=1;); + error= do_after_row_operations(table, error); + if (!cache_stmt) + thd->options|= OPTION_STATUS_NO_TRANS_UPDATE; + + } + + if (error) + { /* error has occured during the transaction */ + /* + If one day we honour --skip-slave-errors in row-based replication, and + the error should be skipped, then we would clear mappings, rollback, + close tables, but the slave SQL thread would not stop and then may + assume the mapping is still 
available, the tables are still open... + So then we should clear mappings/rollback/close here only if this is a + STMT_END_F. + For now we code, knowing that error is not skippable and so slave SQL + thread is certainly going to stop. + */ + rli->cleanup_context(thd, 1); + thd->query_error= 1; + DBUG_RETURN(error); + } + + if (get_flags(STMT_END_F)) + { + /* + This is the end of a statement or transaction, so close (and + unlock) the tables we opened when processing the + Table_map_log_event starting the statement. + + OBSERVER. This will clear *all* mappings, not only those that + are open for the table. There is not good handle for on-close + actions for tables. + + NOTE. Even if we have no table ('table' == 0) we still need to be + here, so that we increase the group relay log position. If we didn't, we + could have a group relay log position which lags behind "forever" + (assume the last master's transaction is ignored by the slave because of + replicate-ignore rules). + */ + thd->binlog_flush_pending_rows_event(true); + /* + If this event is not in a transaction, the call below will, if some + transactional storage engines are involved, commit the statement into + them and flush the pending event to binlog. + If this event is in a transaction, the call will do nothing, but a + Xid_log_event will come next which will, if some transactional engines + are involved, commit the transaction and flush the pending event to the + binlog. + */ + error= ha_autocommit_or_rollback(thd, 0); + /* + Now what if this is not a transactional engine? we still need to + flush the pending event to the binlog; we did it with + thd->binlog_flush_pending_rows_event(). Note that we imitate + what is done for real queries: a call to + ha_autocommit_or_rollback() (sometimes only if involves a + transactional engine), and a call to be sure to have the pending + event flushed. + */ + + rli->cleanup_context(thd, 0); + rli->transaction_end(thd); + + if (error == 0) + { + /* + Clear any errors pushed in thd->net.last_err* if for example "no key + found" (as this is allowed). This is a safety measure; apparently + those errors (e.g. when executing a Delete_rows_log_event of a + non-existing row, like in rpl_row_mystery22.test, + thd->net.last_error = "Can't find record in 't1'" and last_errno=1032) + do not become visible. We still prefer to wipe them out. + */ + thd->clear_error(); + error= Log_event::exec_event(rli); + } + else + slave_print_msg(ERROR_LEVEL, rli, error, + "Error in %s event: commit of row events failed, " + "table `%s`.`%s`", + get_type_str(), table->s->db, table->s->table_name); + DBUG_RETURN(error); + } + + if (table) + { + /* + As "table" is not NULL, we did a successful lock_tables(), without any + prior LOCK TABLES and are not in prelocked mode, so this assertion should + be true. + */ + DBUG_ASSERT(thd->lock); + /* + If we are here, there are more events to come which may use our mappings + and our table. So don't clear mappings or close tables, just unlock + tables. + Why don't we lock the table once for all in + Table_map_log_event::exec_event() ? Because we could have in binlog: + BEGIN; + Table_map t1 -> 1 + Write_rows to id 1 + Table_map t2 -> 2 + Write_rows to id 2 + Xid_log_event + So we cannot lock t1 when executing the first Table_map, because at that + moment we don't know we'll also have to lock t2, and all tables must be + locked at once in MySQL. 
+ */ + mysql_unlock_tables(thd, thd->lock); + thd->lock= 0; + if ((table->s->primary_key == MAX_KEY) && + !cache_stmt) + { + /* + ------------ Temporary fix until WL#2975 is implemented --------- + This event is not the last one (no STMT_END_F). If we stop now (in + case of terminate_slave_thread()), how will we restart? We have to + restart from Table_map_log_event, but as this table is not + transactional, the rows already inserted will still be present, and + idempotency is not guaranteed (no PK) so we risk that repeating leads + to double insert. So we desperately try to continue, hope we'll + eventually leave this buggy situation (by executing the final + Rows_log_event). If we are in a hopeless wait (reached end of last + relay log and nothing gets appended there), we timeout after one + minute, and notify DBA about the problem. + When WL#2975 is implemented, just remove the member + st_relay_log_info::unsafe_to_stop_at and all its occurences. + */ + rli->unsafe_to_stop_at= time(0); + } + } + + DBUG_ASSERT(error == 0); + thd->clear_error(); + rli->inc_event_relay_log_pos(); + + DBUG_RETURN(0); +} +#endif /* !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) */ + +#ifndef MYSQL_CLIENT +bool Rows_log_event::write_data_header(IO_CACHE *file) +{ + DBUG_ASSERT(m_table_id != ULONG_MAX); + byte buf[ROWS_HEADER_LEN]; // No need to init the buffer + DBUG_EXECUTE_IF("old_row_based_repl_4_byte_map_id_master", + { + int4store(buf + 0, m_table_id); + int2store(buf + 4, m_flags); + return (my_b_safe_write(file, buf, 6)); + }); + int6store(buf + RW_MAPID_OFFSET, (ulonglong)m_table_id); + int2store(buf + RW_FLAGS_OFFSET, m_flags); + return (my_b_safe_write(file, buf, ROWS_HEADER_LEN)); +} + +bool Rows_log_event::write_data_body(IO_CACHE*file) +{ + /* + Note that this should be the number of *bits*, not the number of + bytes. + */ + byte sbuf[my_vle_sizeof(m_width)]; + my_ptrdiff_t const data_size= m_rows_cur - m_rows_buf; + + char *const sbuf_end= my_vle_encode(sbuf, sizeof(sbuf), m_width); + DBUG_ASSERT(static_cast<my_size_t>(sbuf_end - sbuf) <= sizeof(sbuf)); + + return (my_b_safe_write(file, sbuf, sbuf_end - sbuf) || + my_b_safe_write(file, reinterpret_cast<byte*>(m_cols.bitmap), + no_bytes_in_map(&m_cols)) || + my_b_safe_write(file, m_rows_buf, data_size)); +} +#endif + +#if defined(HAVE_REPLICATION) && !defined(MYSQL_CLIENT) && defined(DBUG_RBR) +void Rows_log_event::pack_info(Protocol *protocol) +{ + char buf[256]; + char const *const flagstr= get_flags(STMT_END_F) ? "STMT_END_F" : ""; + char const *const dbnam= m_table->s->db; + char const *const tblnam= m_table->s->table_name; + my_size_t bytes= snprintf(buf, sizeof(buf), + "%s.%s - %s", dbnam, tblnam, flagstr); + protocol->store(buf, bytes, &my_charset_bin); +} +#endif + +/************************************************************************** + Table_map_log_event member functions +**************************************************************************/ + +/* + Constructor used to build an event for writing to the binary log. + Mats says tbl->s lives longer than this event so it's ok to copy pointers + (tbl->s->db etc) and not pointer content. + */ +#if !defined(MYSQL_CLIENT) +Table_map_log_event::Table_map_log_event(THD *thd, TABLE *tbl, ulong tid, + bool is_transactional, uint16 flags) + : Log_event(thd, 0, is_transactional), + m_table(tbl), + m_dbnam(tbl->s->db.str), + m_dblen(m_dbnam ? 
tbl->s->db.length : 0), + m_tblnam(tbl->s->table_name.str), + m_tbllen(tbl->s->table_name.length), + m_colcnt(tbl->s->fields), m_coltype(0), + m_table_id(tid), + m_flags(flags) +{ + DBUG_ASSERT(m_table_id != ULONG_MAX); + /* + In TABLE_SHARE, "db" and "table_name" are 0-terminated (see this comment in + table.cc / alloc_table_share(): + Use the fact the key is db/0/table_name/0 + As we rely on this let's assert it. + */ + DBUG_ASSERT((tbl->s->db.str == 0) || + (tbl->s->db.str[tbl->s->db.length] == 0)); + DBUG_ASSERT(tbl->s->table_name.str[tbl->s->table_name.length] == 0); + + + m_data_size= TABLE_MAP_HEADER_LEN; + DBUG_EXECUTE_IF("old_row_based_repl_4_byte_map_id_master", m_data_size= 6;) + m_data_size+= m_dblen + 2; // Include length and terminating \0 + m_data_size+= m_tbllen + 2; // Include length and terminating \0 + m_data_size+= 1 + m_colcnt; // COLCNT and column types + + /* If malloc fails, catched in is_valid() */ + if ((m_memory= my_malloc(m_colcnt, MYF(MY_WME)))) + { + m_coltype= reinterpret_cast<unsigned char*>(m_memory); + for (unsigned int i= 0 ; i < m_table->s->fields ; ++i) + m_coltype[i]= m_table->field[i]->type(); + } +} +#endif /* !defined(MYSQL_CLIENT) */ + +/* + Constructor used by slave to read the event from the binary log. + */ +#if defined(HAVE_REPLICATION) +Table_map_log_event::Table_map_log_event(const char *buf, uint event_len, + const Format_description_log_event + *description_event) + + : Log_event(buf, description_event), +#ifndef MYSQL_CLIENT + m_table(NULL), +#endif + m_memory(NULL) +{ + DBUG_ENTER("Table_map_log_event::Table_map_log_event(const char*,uint,...)"); + + uint8 common_header_len= description_event->common_header_len; + uint8 post_header_len= description_event->post_header_len[TABLE_MAP_EVENT-1]; + DBUG_PRINT("info",("event_len=%ld, common_header_len=%d, post_header_len=%d", + event_len, common_header_len, post_header_len)); + + DBUG_DUMP("event buffer", buf, event_len); + + /* Read the post-header */ + const char *post_start= buf + common_header_len; + + post_start+= TM_MAPID_OFFSET; + if (post_header_len == 6) + { + /* Master is of an intermediate source tree before 5.1.4. Id is 4 bytes */ + m_table_id= uint4korr(post_start); + post_start+= 4; + } + else + { + DBUG_ASSERT(post_header_len == TABLE_MAP_HEADER_LEN); + m_table_id= uint6korr(post_start); + post_start+= TM_FLAGS_OFFSET; + } + + DBUG_ASSERT(m_table_id != ULONG_MAX); + + m_flags= uint2korr(post_start); + + /* Read the variable part of the event */ + const char *const vpart= buf + common_header_len + post_header_len; + + /* Extract the length of the various parts from the buffer */ + byte const* const ptr_dblen= vpart + 0; + m_dblen= *(unsigned char*) ptr_dblen; + + /* Length of database name + counter + terminating null */ + byte const* const ptr_tbllen= ptr_dblen + m_dblen + 2; + m_tbllen= *(unsigned char*) ptr_tbllen; + + /* Length of table name + counter + terminating null */ + byte const* const ptr_colcnt= ptr_tbllen + m_tbllen + 2; + byte const* const ptr_after_colcnt= my_vle_decode(&m_colcnt, ptr_colcnt); + + DBUG_PRINT("info",("m_dblen=%d off=%d m_tbllen=%d off=%d m_colcnt=%d off=%d", + m_dblen, ptr_dblen-vpart, m_tbllen, ptr_tbllen-vpart, + m_colcnt, ptr_colcnt-vpart)); + + /* Allocate mem for all fields in one go. 
If fails, catched in is_valid() */ + m_memory= my_multi_malloc(MYF(MY_WME), + &m_dbnam, m_dblen + 1, + &m_tblnam, m_tbllen + 1, + &m_coltype, m_colcnt, + NULL); + + if (m_memory) + { + /* Copy the different parts into their memory */ + strncpy(const_cast<char*>(m_dbnam), ptr_dblen + 1, m_dblen + 1); + strncpy(const_cast<char*>(m_tblnam), ptr_tbllen + 1, m_tbllen + 1); + memcpy(m_coltype, ptr_after_colcnt, m_colcnt); + } + + DBUG_VOID_RETURN; +} +#endif + +Table_map_log_event::~Table_map_log_event() +{ + my_free(m_memory, MYF(MY_ALLOW_ZERO_PTR)); +} + +/* + Find a table based on database name and table name. + + DESCRIPTION + + Currently, only the first table of the 'table_list' is located. If the + table is found in the list of open tables for the thread, the 'table' + field of 'table_list' is filled in. + + PARAMETERS + + thd Thread structure + table_list List of tables to locate in the thd->open_tables list. + count Pointer to a variable that will be set to the number of + tables found. If the pointer is NULL, nothing will be stored. + + RETURN VALUE + + The number of tables found. + + TO DO + + Replace the list of table searches with a hash based on the combined + database and table name. The handler_tables_hash is inappropriate since + it hashes on the table alias. At the same time, the function can be + extended to handle a full list of table names, in the same spirit as + open_tables() and lock_tables(). +*/ +#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) +static uint find_tables(THD *thd, TABLE_LIST *table_list, uint *count) +{ + uint result= 0; + + /* we verify that the caller knows our limitation */ + DBUG_ASSERT(table_list->next_global == 0); + for (TABLE *table= thd->open_tables; table ; table= table->next) + { + if (strcmp(table->s->db.str, table_list->db) == 0 + && strcmp(table->s->table_name.str, table_list->table_name) == 0) + { + /* Copy the table pointer into the table list. */ + table_list->table= table; + result= 1; + break; + } + } + + if (count) + *count= result; + return result; +} +#endif + +/* + Return value is an error code, one of: + + -1 Failure to open table [from open_tables()] + 0 Success + 1 No room for more tables [from set_table()] + 2 Out of memory [from set_table()] + 3 Wrong table definition + 4 Daisy-chaining RBR with SBR not possible + */ + +#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) +int Table_map_log_event::exec_event(st_relay_log_info *rli) +{ + DBUG_ENTER("Table_map_log_event::exec_event(st_relay_log_info*)"); + + DBUG_ASSERT(rli->sql_thd == thd); + + /* Step the query id to mark what columns that are actually used. */ + pthread_mutex_lock(&LOCK_thread_count); + thd->query_id= next_query_id(); + pthread_mutex_unlock(&LOCK_thread_count); + + TABLE_LIST table_list; + uint32 dummy_len; + bzero(&table_list, sizeof(table_list)); + table_list.db= const_cast<char *> + (rpl_filter->get_rewrite_db(m_dbnam, &dummy_len)); + table_list.alias= table_list.table_name= const_cast<char*>(m_tblnam); + table_list.lock_type= TL_WRITE; + table_list.next_global= table_list.next_local= 0; + table_list.updating= 1; + + int error= 0; + + if (rpl_filter->db_ok(table_list.db) && + (!rpl_filter->is_on() || rpl_filter->tables_ok("", &table_list))) + { + /* + Check if the slave is set to use SBR. If so, the slave should + stop immediately since it is not possible to daisy-chain from + RBR to SBR. Once RBR is used, the rest of the chain has to use + RBR. 
+ */ + if (mysql_bin_log.is_open() && (thd->options & OPTION_BIN_LOG) && + !binlog_row_based) + { + slave_print_msg(ERROR_LEVEL, rli, ER_BINLOG_ROW_RBR_TO_SBR, + "It is not possible to use statement-based binlogging " + "on a slave that replicates row-based. Please use " + "--binrow-format=row on slave if you want to use " + "--log-slave-updates and read row-based binlog events."); + DBUG_RETURN(ERR_RBR_TO_SBR); + } + + /* + Open the table if it is not already open and add the table to table map. + If the table should not be replicated, we don't bother to do anything. + The table map will return NULL and the row-level event will effectively + be a no-op. + */ + uint count; + if (find_tables(thd, &table_list, &count) == 0) + { + /* + open_tables() reads the contents of thd->lex, so they must be + initialized, so we should call lex_start(); to be even safer, we call + mysql_init_query() which does a more complete set of inits. + */ + mysql_init_query(thd, NULL, 0); + TABLE_LIST *tables= &table_list; + if ((error= open_tables(thd, &tables, &count, 0))) + { + if (thd->query_error || thd->is_fatal_error) + { + /* + Error reporting borrowed from Query_log_event with many excessive + simplifications (we don't honour --slave-skip-errors) + */ + uint actual_error= thd->net.last_errno; + slave_print_msg(ERROR_LEVEL, rli, actual_error, + "Error '%s' on opening table `%s`.`%s`", + (actual_error ? thd->net.last_error : + "unexpected success or fatal error"), + table_list.db, table_list.table_name); + thd->query_error= 1; + } + DBUG_RETURN(error); + } + } + + m_table= table_list.table; + + /* + This will fail later otherwise, the 'in_use' field should be + set to the current thread. + */ + DBUG_ASSERT(m_table->in_use); + + /* + Check that the number of columns and the field types in the + event match the number of columns and field types in the opened + table. + */ + uint col= m_table->s->fields; + + if (col == m_colcnt) + { + while (col-- > 0) + if (m_table->field[col]->type() != m_coltype[col]) + break; + } + + TABLE_SHARE const *const tsh= m_table->s; + + /* + Check the following termination conditions: + + (col == m_table->s->fields) + ==> (m_table->s->fields != m_colcnt) + (0 <= col < m_table->s->fields) + ==> (m_table->field[col]->type() != m_coltype[col]) + + Logically, A ==> B is equivalent to !A || B + + Since col is unsigned, is suffices to check that col <= + tsh->fields. If col wrapped (by decreasing col when it is 0), + the number will be UINT_MAX, which is greater than tsh->fields. + */ + DBUG_ASSERT(!(col == tsh->fields) || tsh->fields != m_colcnt); + DBUG_ASSERT(!(col < tsh->fields) || + (m_table->field[col]->type() != m_coltype[col])); + + if (col <= tsh->fields) + { + /* + If we get here, the number of columns in the event didn't + match the number of columns in the table on the slave, *or* + there were a column in the table on the slave that did not + have the same type as given in the event. + + If 'col' has the value that was assigned to it, it was a + mismatch between the number of columns on the master and the + slave. 
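The countdown idiom described above can be hard to read in isolation. The following standalone sketch (plain C++, not server code) expresses the same check: it returns UINT_MAX when every column matches, the slave's column count when the widths differ, and the index of the highest mismatching column type otherwise.

  static unsigned check_columns(const unsigned char *slave_types, unsigned slave_count,
                                const unsigned char *event_types, unsigned event_count)
  {
    unsigned col= slave_count;
    if (col == event_count)
    {
      while (col-- > 0)                 /* scan from the last column downwards */
        if (slave_types[col] != event_types[col])
          break;                        /* col is left at the mismatching index */
      /* full match: the final col-- wraps col around to UINT_MAX */
    }
    return col;
  }

A width mismatch leaves col == slave_count, a type mismatch leaves col < slave_count, and a full match wraps col past zero, which is exactly the property that the assertions and the col <= tsh->fields test above rely on.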
+ */ + if (col == tsh->fields) + { + DBUG_ASSERT(tsh->db.str && tsh->table_name.str); + slave_print_msg(ERROR_LEVEL, rli, ER_BINLOG_ROW_WRONG_TABLE_DEF, + "Table width mismatch - " + "received %u columns, %s.%s has %u columns", + m_colcnt, tsh->db.str, tsh->table_name.str, tsh->fields); + } + else + { + DBUG_ASSERT(col < m_colcnt && col < tsh->fields); + DBUG_ASSERT(tsh->db.str && tsh->table_name.str); + slave_print_msg(ERROR_LEVEL, rli, ER_BINLOG_ROW_WRONG_TABLE_DEF, + "Column %d type mismatch - " + "received type %d, %s.%s has type %d", + col, m_coltype[col], tsh->db.str, tsh->table_name.str, + m_table->field[col]->type()); + } + + thd->query_error= 1; + DBUG_RETURN(ERR_BAD_TABLE_DEF); + } + + /* + We record in the slave's information that the number m_table_id is + mapped to the m_table object + */ + if (!error) + error= rli->m_table_map.set_table(m_table_id, m_table); + + /* + Tell the RLI that we are touching a table. + + TODO: Maybe we can combine this with the previous operation? + */ + if (!error) + rli->touching_table(m_dbnam, m_tblnam, m_table_id); + } + + /* + We explicitly do not call Log_event::exec_event() here since we do not + want the relay log position to be flushed to disk. The flushing will be + done by the last Rows_log_event that either ends a statement (outside a + transaction) or a transaction. + + A table map event can *never* end a transaction or a statement, so we + just step the relay log position. + */ + + if (likely(!error)) + rli->inc_event_relay_log_pos(); + + DBUG_RETURN(error); +} +#endif /* !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) */ + +#ifndef MYSQL_CLIENT +bool Table_map_log_event::write_data_header(IO_CACHE *file) +{ + DBUG_ASSERT(m_table_id != ULONG_MAX); + byte buf[TABLE_MAP_HEADER_LEN]; + DBUG_EXECUTE_IF("old_row_based_repl_4_byte_map_id_master", + { + int4store(buf + 0, m_table_id); + int2store(buf + 4, m_flags); + return (my_b_safe_write(file, buf, 6)); + }); + int6store(buf + TM_MAPID_OFFSET, (ulonglong)m_table_id); + int2store(buf + TM_FLAGS_OFFSET, m_flags); + return (my_b_safe_write(file, buf, TABLE_MAP_HEADER_LEN)); +} + +bool Table_map_log_event::write_data_body(IO_CACHE *file) +{ + DBUG_ASSERT(m_dbnam != NULL); + DBUG_ASSERT(m_tblnam != NULL); + /* We use only one byte per length for storage in event: */ + DBUG_ASSERT(m_dblen < 128); + DBUG_ASSERT(m_tbllen < 128); + + byte const dbuf[]= { m_dblen }; + byte const tbuf[]= { m_tbllen }; + + byte cbuf[my_vle_sizeof(m_colcnt)]; + byte *const cbuf_end= my_vle_encode(cbuf, sizeof(cbuf), m_colcnt); + DBUG_ASSERT(static_cast<my_size_t>(cbuf_end - cbuf) <= sizeof(cbuf)); + + return (my_b_safe_write(file, dbuf, sizeof(dbuf)) || + my_b_safe_write(file, m_dbnam, m_dblen+1) || + my_b_safe_write(file, tbuf, sizeof(tbuf)) || + my_b_safe_write(file, m_tblnam, m_tbllen+1) || + my_b_safe_write(file, cbuf, cbuf_end - cbuf) || + my_b_safe_write(file, reinterpret_cast<char*>(m_coltype), m_colcnt)); + } +#endif + +#if defined(HAVE_REPLICATION) && !defined(MYSQL_CLIENT) + +/* + Print some useful information for the SHOW BINARY LOG information + field. 
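Putting Table_map_log_event::write_data_header() and write_data_body() just above together, the event payload they produce can be summarized as follows (a reader's sketch of what the code writes, assuming the regular 8-byte post-header; the 6-byte variant produced under the old_row_based_repl_4_byte_map_id_master debug hook is omitted):

  post-header (TABLE_MAP_HEADER_LEN = 8 bytes):
    at TM_MAPID_OFFSET (0):   6-byte table id
    at TM_FLAGS_OFFSET (6):   2-byte flags
  body:
    1 byte                    database name length m_dblen (must be < 128)
    m_dblen + 1 bytes         database name, NUL-terminated
    1 byte                    table name length m_tbllen (must be < 128)
    m_tbllen + 1 bytes        table name, NUL-terminated
    vle-encoded integer       column count m_colcnt
    m_colcnt bytes            one column type byte per column

The constructor that reads the event from a buffer parses exactly this layout.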
+ */ + +void Table_map_log_event::pack_info(Protocol *protocol) +{ + char buf[256]; + my_size_t bytes= snprintf(buf, sizeof(buf), "%s.%s", m_dbnam, m_tblnam); + protocol->store(buf, bytes, &my_charset_bin); +} + +#endif + + +#ifdef MYSQL_CLIENT +void Table_map_log_event::print(FILE *file, PRINT_EVENT_INFO *print_event_info) +{ + if (!print_event_info->short_form) + { + print_header(file, print_event_info); + fprintf(file, "\tTable_map: `%s`.`%s` mapped to number %lu\n", + m_dbnam, m_tblnam, m_table_id); + print_base64(file, print_event_info); + } +} +#endif + +/************************************************************************** + Write_rows_log_event member functions +**************************************************************************/ + +/* + Constructor used to build an event for writing to the binary log. + */ +#if !defined(MYSQL_CLIENT) +Write_rows_log_event::Write_rows_log_event(THD *thd_arg, TABLE *tbl_arg, + ulong tid_arg, + MY_BITMAP const *cols, + bool is_transactional) + : Rows_log_event(thd_arg, tbl_arg, tid_arg, cols, is_transactional) +{ +} +#endif + +/* + Constructor used by slave to read the event from the binary log. + */ +#ifdef HAVE_REPLICATION +Write_rows_log_event::Write_rows_log_event(const char *buf, uint event_len, + const Format_description_log_event + *description_event) +: Rows_log_event(buf, event_len, WRITE_ROWS_EVENT, description_event) +{ +} +#endif + +#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) +int Write_rows_log_event::do_before_row_operations(TABLE *table) +{ + int error= 0; + + /* + We are using REPLACE semantics and not INSERT IGNORE semantics + when writing rows, that is: new rows replace old rows. We need to + inform the storage engine that it should use this behaviour. + */ + + /* Tell the storage engine that we are using REPLACE semantics. */ + thd->lex->duplicates= DUP_REPLACE; + + /* + Pretend we're executing a REPLACE command: this is needed for + InnoDB and NDB Cluster since they are not (properly) checking the + lex->duplicates flag. + */ + thd->lex->sql_command= SQLCOM_REPLACE; + + table->file->extra(HA_EXTRA_IGNORE_DUP_KEY); // needed for ndbcluster + /* + TODO: the cluster team (Tomas?) says that it's better if the engine knows + how many rows are going to be inserted, then it can allocate needed memory + from the start. + */ + table->file->start_bulk_insert(0); + /* + We need TIMESTAMP_NO_AUTO_SET otherwise ha_write_row() will not use fill + any TIMESTAMP column with data from the row but instead will use + the event's current time. + As we replicate from TIMESTAMP to TIMESTAMP and slave has no extra + columns, we know that all TIMESTAMP columns on slave will receive explicit + data from the row, so TIMESTAMP_NO_AUTO_SET is ok. + When we allow a table without TIMESTAMP to be replicated to a table having + more columns including a TIMESTAMP column, or when we allow a TIMESTAMP + column to be replicated into a BIGINT column and the slave's table has a + TIMESTAMP column, then the slave's TIMESTAMP column will take its value + from set_time() which we called earlier (consistent with SBR). And then in + some cases we won't want TIMESTAMP_NO_AUTO_SET (will require some code to + analyze if explicit data is provided for slave's TIMESTAMP columns). 
+ */ + table->timestamp_field_type= TIMESTAMP_NO_AUTO_SET; + return error; +} + +int Write_rows_log_event::do_after_row_operations(TABLE *table, int error) +{ + if (error == 0) + error= table->file->end_bulk_insert(); + return error; +} + +char const *Write_rows_log_event::do_prepare_row(THD *thd, TABLE *table, + char const *row_start) +{ + char const *ptr= row_start; + DBUG_ASSERT(table != NULL); + /* + This assertion actually checks that there is at least as many + columns on the slave as on the master. + */ + DBUG_ASSERT(table->s->fields >= m_width); + DBUG_ASSERT(ptr); + ptr= unpack_row(table, table->record[0], ptr, &m_cols); + return ptr; +} + +/* + Check if there are more UNIQUE keys after the given key. +*/ +static int +last_uniq_key(TABLE *table, uint keyno) +{ + while (++keyno < table->s->keys) + if (table->key_info[keyno].flags & HA_NOSAME) + return 0; + return 1; +} + +/* Anonymous namespace for template functions/classes */ +namespace { + + /* + Smart pointer that will automatically call my_afree (a macro) when + the pointer goes out of scope. This is used so that I do not have + to remember to call my_afree() before each return. There is no + overhead associated with this, since all functions are inline. + + I (Matz) would prefer to use the free function as a template + parameter, but that is not possible when the "function" is a + macro. + */ + template <class Obj> + class auto_afree_ptr + { + Obj* m_ptr; + public: + auto_afree_ptr(Obj* ptr) : m_ptr(ptr) { } + ~auto_afree_ptr() { if (m_ptr) my_afree(m_ptr); } + void assign(Obj* ptr) { + /* Only to be called if it hasn't been given a value before. */ + DBUG_ASSERT(m_ptr == NULL); + m_ptr= ptr; + } + Obj* get() { return m_ptr; } + }; + +} + + +/* + Replace the provided record in the database. + + Similar to how it is done in <code>mysql_insert()</code>, we first + try to do a <code>ha_write_row()</code> and of that fails due to + duplicated keys (or indices), we do an <code>ha_update_row()</code> + or a <code>ha_delete_row()</code> instead. + + @param thd Thread context for writing the record. + @param table Table to which record should be written. + + @return Error code on failure, 0 on success. + */ +static int +replace_record(THD *thd, TABLE *table) +{ + DBUG_ASSERT(table != NULL && thd != NULL); + + int error; + int keynum; + auto_afree_ptr<char> key(NULL); + + while ((error= table->file->ha_write_row(table->record[0]))) + { + if ((keynum= table->file->get_dup_key(error)) < 0) + { + /* We failed to retrieve the duplicate key */ + return HA_ERR_FOUND_DUPP_KEY; + } + + /* + We need to retrieve the old row into record[1] to be able to + either update or delete the offending record. We either: + + - use rnd_pos() with a row-id (available as dupp_row) to the + offending row, if that is possible (MyISAM and Blackhole), or else + + - use index_read_idx() with the key that is duplicated, to + retrieve the offending row. 
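As a reader's outline of the loop that follows (condensing, not changing, its behaviour):

  for (;;)
  {
    ha_write_row(record[0]);            success -> done
    on a duplicate key error:
      fetch the offending row into record[1], either with rnd_pos()
        (engines advertising HA_DUPP_POS) or with index_read_idx() on the
        duplicated key;
      if this was the last UNIQUE key and no FOREIGN KEY references the table
        -> ha_update_row(record[1], record[0]) and return;
      otherwise
        -> ha_delete_row(record[1]) and retry the write
  }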
+ */ + if (table->file->table_flags() & HA_DUPP_POS) + { + error= table->file->rnd_pos(table->record[1], table->file->dupp_ref); + if (error) + return error; + } + else + { + if (table->file->extra(HA_EXTRA_FLUSH_CACHE)) + { + return my_errno; + } + + if (key.get() == NULL) + { + key.assign(static_cast<char*>(my_alloca(table->s->max_unique_length))); + if (key.get() == NULL) + return ENOMEM; + } + + key_copy(key.get(), table->record[0], table->key_info + keynum, 0); + error= table->file->index_read_idx(table->record[1], keynum, key.get(), + table->key_info[keynum].key_length, + HA_READ_KEY_EXACT); + if (error) + return error; + } + + /* + Now, table->record[1] should contain the offending row. That + will enable us to update it or, alternatively, delete it (so + that we can insert the new row afterwards). + + REPLACE is defined as either INSERT or DELETE + INSERT. If + possible, we can replace it with an UPDATE, but that will not + work on InnoDB if FOREIGN KEY checks are necessary. + + I (Matz) am not sure of the reason for the last_uniq_key() + check as, but I'm guessing that it's something along the + following lines. + + Suppose that we got the duplicate key to be a key that is not + the last unique key for the table and we perform an update: + then there might be another key for which the unique check will + fail, so we're better off just deleting the row and inserting + the correct row. + */ + if (last_uniq_key(table, keynum) && + !table->file->referenced_by_foreign_key()) + { + error=table->file->ha_update_row(table->record[1], + table->record[0]); + return error; + } + else + { + if ((error= table->file->ha_delete_row(table->record[1]))) + return error; + /* Will retry ha_write_row() with the offending row removed. */ + } + } + return error; +} + +int Write_rows_log_event::do_exec_row(TABLE *table) +{ + DBUG_ASSERT(table != NULL); + int error= replace_record(thd, table); + return error; +} +#endif /* !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) */ + +#ifdef MYSQL_CLIENT +void Write_rows_log_event::print(FILE *file, PRINT_EVENT_INFO* print_event_info) +{ + if (!print_event_info->short_form) + { + print_header(file, print_event_info); + fprintf(file, "\tWrite_rows: table id %lu", m_table_id); + print_base64(file, print_event_info); + } +} +#endif + +/************************************************************************** + Delete_rows_log_event member functions +**************************************************************************/ + +#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) +static int record_compare(TABLE *table, byte const *a, byte const *b) +{ + for (my_size_t i= 0 ; i < table->s->fields ; ++i) + { + uint const off= table->field[i]->offset(); + uint const res= table->field[i]->cmp_binary(a + off, b + off); + if (res != 0) { + return res; + } + } + return 0; +} + + +/* + Find the row given by 'key', if the table has keys, or else use a table scan + to find (and fetch) the row. If the engine allows random access of the + records, a combination of position() and rnd_pos() will be used. + + The 'record_buf' will be used as buffer for records while locating the + correct row. 
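In outline, find_and_fetch_row() below picks one of three strategies, in this order (a reader's summary of the code):

  1. Engine sets HA_PRIMARY_KEY_ALLOW_RANDOM_ACCESS and the table has a primary key:
       position(record[0]) followed by rnd_pos(); neither 'key' nor 'record_buf' is used.
  2. The table has at least one key:
       index_read_idx() on key 0 using 'key'; unless key 0 is HA_NOSAME, step with
       index_next() until record_compare() reports a match.
  3. No keys at all:
       rnd_next() full scan comparing every row, restarting once from the top on
       HA_ERR_END_OF_FILE.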
+ */ +static int find_and_fetch_row(TABLE *table, byte *key, byte *record_buf) +{ + DBUG_ENTER("find_and_fetch_row(TABLE *table, byte *key, byte *record)"); + DBUG_PRINT("enter", ("table=%p, key=%p, record=%p", + table, key, record_buf)); + + DBUG_ASSERT(table->in_use != NULL); + + if ((table->file->table_flags() & HA_PRIMARY_KEY_ALLOW_RANDOM_ACCESS) + && table->s->primary_key < MAX_KEY) + { + /* + Use a more efficient method to fetch the record given by + table->record[0] if the engine allows it. We first compute a + row reference using the position() member function (it will be + stored in table->file->ref) and the use rnd_pos() to position + the "cursor" at the correct row. + */ + table->file->position(table->record[0]); + DBUG_RETURN(table->file->rnd_pos(table->record[0], table->file->ref)); + } + + DBUG_ASSERT(record_buf); + + if (table->s->keys > 0) + { + int error; + if ((error= table->file->index_read_idx(record_buf, 0, key, + table->key_info->key_length, + HA_READ_KEY_EXACT))) + { + table->file->print_error(error, MYF(0)); + DBUG_RETURN(error); + } + + /* + Below is a minor "optimization". If the key (i.e., key number + 0) has the HA_NOSAME flag set, we know that we have found the + correct record (since there can be no duplicates); otherwise, we + have to compare the record with the one found to see if it is + the correct one. + + CAVEAT! This behaviour is essential for the replication of, + e.g., the mysql.proc table since the correct record *shall* be + found using the primary key *only*. There shall be no + comparison of non-PK columns to decide if the correct record is + found. I can see no scenario where it would be incorrect to + chose the row to change only using a PK or an UNNI. + */ + if (table->key_info->flags & HA_NOSAME) + DBUG_RETURN(0); + + while (record_compare(table, table->record[0], record_buf) != 0) + { + int error; + if ((error= table->file->index_next(record_buf))) + { + table->file->print_error(error, MYF(0)); + DBUG_RETURN(error); + } + } + } + else + { + /* Continue until we find the right record or have made a full loop */ + int restart_count= 0; // Number of times scanning has restarted from top + int error= 0; + do + { + error= table->file->rnd_next(record_buf); + switch (error) + { + case 0: + case HA_ERR_RECORD_DELETED: + break; + + case HA_ERR_END_OF_FILE: + if (++restart_count < 2) + table->file->ha_rnd_init(1); + break; + + default: + table->file->print_error(error, MYF(0)); + DBUG_RETURN(error); + } + } + while (restart_count < 2 && + record_compare(table, table->record[0], record_buf) != 0); + + DBUG_ASSERT(error == HA_ERR_END_OF_FILE || error == 0); + DBUG_RETURN(error); + } + + DBUG_RETURN(0); +} +#endif + +/* + Constructor used to build an event for writing to the binary log. + */ + +#ifndef MYSQL_CLIENT +Delete_rows_log_event::Delete_rows_log_event(THD *thd_arg, TABLE *tbl_arg, + ulong tid, MY_BITMAP const *cols, + bool is_transactional) + : Rows_log_event(thd_arg, tbl_arg, tid, cols, is_transactional) +#ifdef HAVE_REPLICATION + ,m_memory(NULL), m_key(NULL), m_search_record(NULL) +#endif +{ +} +#endif /* #if !defined(MYSQL_CLIENT) */ + +/* + Constructor used by slave to read the event from the binary log. 
+ */ +#ifdef HAVE_REPLICATION +Delete_rows_log_event::Delete_rows_log_event(const char *buf, uint event_len, + const Format_description_log_event + *description_event) +#if defined(MYSQL_CLIENT) + : Rows_log_event(buf, event_len, DELETE_ROWS_EVENT, description_event) +#else + : Rows_log_event(buf, event_len, DELETE_ROWS_EVENT, description_event), + m_memory(NULL), m_key(NULL), m_search_record(NULL) +#endif +{ +} +#endif + +#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) +int Delete_rows_log_event::do_before_row_operations(TABLE *table) +{ + DBUG_ASSERT(m_memory == NULL); + + if ((table->file->table_flags() & HA_PRIMARY_KEY_ALLOW_RANDOM_ACCESS) && + table->s->primary_key < MAX_KEY) + { + /* + We don't need to allocate any memory for m_search_record and + m_key since they are not used. + */ + return 0; + } + + int error= 0; + + if (table->s->keys > 0) + { + m_memory= + my_multi_malloc(MYF(MY_WME), + &m_search_record, table->s->reclength, + &m_key, table->key_info->key_length, + NULL); + } + else + { + m_memory= m_search_record= my_malloc(table->s->reclength, MYF(MY_WME)); + m_key= NULL; + } + if (!m_memory) + return HA_ERR_OUT_OF_MEM; + + if (table->s->keys > 0) + { + /* We have a key: search the table using the index */ + if (!table->file->inited) + error= table->file->ha_index_init(0, FALSE); + } + else + { + /* We doesn't have a key: search the table using rnd_next() */ + error= table->file->ha_rnd_init(1); + } + + return error; +} + +int Delete_rows_log_event::do_after_row_operations(TABLE *table, int error) +{ + /*error= ToDo:find out what this should really be, this triggers close_scan in nbd, returning error?*/ + table->file->ha_index_or_rnd_end(); + my_free(m_memory, MYF(MY_ALLOW_ZERO_PTR)); // Free for multi_malloc + m_memory= m_search_record= m_key= NULL; + + return error; +} + +char const *Delete_rows_log_event::do_prepare_row(THD *thd, TABLE *table, + char const *row_start) +{ + char const *ptr= row_start; + DBUG_ASSERT(ptr); + /* + This assertion actually checks that there is at least as many + columns on the slave as on the master. + */ + DBUG_ASSERT(table->s->fields >= m_width); + + DBUG_ASSERT(ptr != NULL); + ptr= unpack_row(table, table->record[0], ptr, &m_cols); + + /* + If we will access rows using the random access method, m_key will + be set to NULL, so we do not need to make a key copy in that case. + */ + if (m_key) + { + KEY *const key_info= table->key_info; + + key_copy(m_key, table->record[0], key_info, 0); + } + + return ptr; +} + +int Delete_rows_log_event::do_exec_row(TABLE *table) +{ + DBUG_ASSERT(table != NULL); + + int error= find_and_fetch_row(table, m_key, m_search_record); + if (error) + return error; + + /* + Now we should have the right row to delete. We are using + record[0] since it is guaranteed to point to a record with the + correct value. 
+ */ + error= table->file->ha_delete_row(table->record[0]); + + return error; +} + +#endif /* !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) */ + +#ifdef MYSQL_CLIENT +void Delete_rows_log_event::print(FILE *file, + PRINT_EVENT_INFO* print_event_info) +{ + if (!print_event_info->short_form) + { + print_header(file, print_event_info); + fprintf(file, "\tDelete_rows: table id %lu", m_table_id); + print_base64(file, print_event_info); + } +} +#endif + + +/************************************************************************** + Update_rows_log_event member functions +**************************************************************************/ + +/* + Constructor used to build an event for writing to the binary log. + */ +#if !defined(MYSQL_CLIENT) +Update_rows_log_event::Update_rows_log_event(THD *thd_arg, TABLE *tbl_arg, + ulong tid, MY_BITMAP const *cols, + bool is_transactional) +: Rows_log_event(thd_arg, tbl_arg, tid, cols, is_transactional) +#ifdef HAVE_REPLICATION + , m_memory(NULL), m_key(NULL) +#endif +{ +} +#endif /* !defined(MYSQL_CLIENT) */ + +/* + Constructor used by slave to read the event from the binary log. + */ +#ifdef HAVE_REPLICATION +Update_rows_log_event::Update_rows_log_event(const char *buf, uint event_len, + const + Format_description_log_event + *description_event) +#if defined(MYSQL_CLIENT) + : Rows_log_event(buf, event_len, UPDATE_ROWS_EVENT, description_event) +#else + : Rows_log_event(buf, event_len, UPDATE_ROWS_EVENT, description_event), + m_memory(NULL), m_key(NULL) +#endif +{ +} +#endif + +#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) +int Update_rows_log_event::do_before_row_operations(TABLE *table) +{ + DBUG_ASSERT(m_memory == NULL); + + if ((table->file->table_flags() & HA_PRIMARY_KEY_ALLOW_RANDOM_ACCESS) && + table->s->primary_key < MAX_KEY) + { + /* + We don't need to allocate any memory for m_search_record and + m_key since they are not used. + */ + return 0; + } + + int error= 0; + + if (table->s->keys > 0) + { + m_memory= + my_multi_malloc(MYF(MY_WME), + &m_search_record, table->s->reclength, + &m_key, table->key_info->key_length, + NULL); + } + else + { + m_memory= m_search_record= my_malloc(table->s->reclength, MYF(MY_WME)); + m_key= NULL; + } + if (!m_memory) + return HA_ERR_OUT_OF_MEM; + + if (table->s->keys > 0) + { + /* We have a key: search the table using the index */ + if (!table->file->inited) + error= table->file->ha_index_init(0, FALSE); + } + else + { + /* We doesn't have a key: search the table using rnd_next() */ + error= table->file->ha_rnd_init(1); + } + table->timestamp_field_type= TIMESTAMP_NO_AUTO_SET; + + return error; +} + +int Update_rows_log_event::do_after_row_operations(TABLE *table, int error) +{ + /*error= ToDo:find out what this should really be, this triggers close_scan in nbd, returning error?*/ + table->file->ha_index_or_rnd_end(); + my_free(m_memory, MYF(MY_ALLOW_ZERO_PTR)); + m_memory= m_search_record= m_key= NULL; + + return error; +} + +char const *Update_rows_log_event::do_prepare_row(THD *thd, TABLE *table, + char const *row_start) +{ + char const *ptr= row_start; + DBUG_ASSERT(ptr); + /* + This assertion actually checks that there is at least as many + columns on the slave as on the master. 
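For Update_rows events, each entry in the event body is assumed (from the code below) to be two consecutive images in the unpack_row() layout sketched earlier:

  update row := [ before image ][ after image ]    both restricted to the columns set in m_cols

The before image goes into record[0] and is used to locate the row (and to build m_key when an index is used); the after image goes into record[1] and is applied with ha_update_row().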
+ */ + DBUG_ASSERT(table->s->fields >= m_width); + + /* record[0] is the before image for the update */ + ptr= unpack_row(table, table->record[0], ptr, &m_cols); + DBUG_ASSERT(ptr != NULL); + /* record[1] is the after image for the update */ + ptr= unpack_row(table, table->record[1], ptr, &m_cols); + + /* + If we will access rows using the random access method, m_key will + be set to NULL, so we do not need to make a key copy in that case. + */ + if (m_key) + { + KEY *const key_info= table->key_info; + + key_copy(m_key, table->record[0], key_info, 0); + } + + return ptr; +} + +int Update_rows_log_event::do_exec_row(TABLE *table) +{ + DBUG_ASSERT(table != NULL); + + int error= find_and_fetch_row(table, m_key, m_search_record); + if (error) + return error; + + /* + Now we should have the right row to update. The record that has + been fetched is guaranteed to be in record[0], so we use that. + */ + error= table->file->ha_update_row(table->record[0], table->record[1]); + + return error; +} +#endif /* !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) */ + +#ifdef MYSQL_CLIENT +void Update_rows_log_event::print(FILE *file, + PRINT_EVENT_INFO* print_event_info) +{ + if (!print_event_info->short_form) + { + print_header(file, print_event_info); + fprintf(file, "\tUpdate_rows: table id %lu", m_table_id); + print_base64(file, print_event_info); + } +} +#endif + +#endif /* defined(HAVE_ROW_BASED_REPLICATION) */ diff --git a/sql/log_event.h b/sql/log_event.h index 7783a97f03f..5d58a204ec9 100644 --- a/sql/log_event.h +++ b/sql/log_event.h @@ -26,6 +26,16 @@ #pragma interface /* gcc class implementation */ #endif +#include <my_bitmap.h> + +#if !defined(MYSQL_CLIENT) +#ifdef HAVE_ROW_BASED_REPLICATION +extern my_bool binlog_row_based; +#else +extern const my_bool binlog_row_based; +#endif +#endif + #define LOG_READ_EOF -1 #define LOG_READ_BOGUS -2 #define LOG_READ_IO -3 @@ -196,6 +206,8 @@ struct sql_ex_info #define EXEC_LOAD_HEADER_LEN 4 #define DELETE_FILE_HEADER_LEN 4 #define FORMAT_DESCRIPTION_HEADER_LEN (START_V3_HEADER_LEN+1+LOG_EVENT_TYPES) +#define ROWS_HEADER_LEN 8 +#define TABLE_MAP_HEADER_LEN 8 #define EXECUTE_LOAD_QUERY_EXTRA_HEADER_LEN (4 + 4 + 4 + 1) #define EXECUTE_LOAD_QUERY_HEADER_LEN (QUERY_HEADER_LEN + EXECUTE_LOAD_QUERY_EXTRA_HEADER_LEN) @@ -302,6 +314,14 @@ struct sql_ex_info /* DF = "Delete File" */ #define DF_FILE_ID_OFFSET 0 +/* TM = "Table Map" */ +#define TM_MAPID_OFFSET 0 +#define TM_FLAGS_OFFSET 6 + +/* RW = "RoWs" */ +#define RW_MAPID_OFFSET 0 +#define RW_FLAGS_OFFSET 6 + /* ELQ = "Execute Load Query" */ #define ELQ_FILE_ID_OFFSET QUERY_HEADER_LEN #define ELQ_FN_POS_START_OFFSET ELQ_FILE_ID_OFFSET + 4 @@ -373,6 +393,12 @@ struct sql_ex_info #define LOG_EVENT_SUPPRESS_USE_F 0x8 /* + The table map version internal to the log should be increased after + the event has been written to the binary log. + */ +#define LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F 0x10 + +/* OPTIONS_WRITTEN_TO_BIN_LOG are the bits of thd->options which must be written to the binlog. OPTIONS_WRITTEN_TO_BINLOG could be written into the Format_description_log_event, so that if later we don't want @@ -427,6 +453,10 @@ enum Log_event_type XID_EVENT= 16, BEGIN_LOAD_QUERY_EVENT= 17, EXECUTE_LOAD_QUERY_EVENT= 18, + TABLE_MAP_EVENT = 19, + WRITE_ROWS_EVENT = 20, + UPDATE_ROWS_EVENT = 21, + DELETE_ROWS_EVENT = 22, /* Add new events here - right above this comment! 
@@ -504,6 +534,7 @@ typedef struct st_print_event_info /* Settings on how to print the events */ bool short_form; + bool base64_output; my_off_t hexdump_from; uint8 common_header_len; @@ -616,9 +647,10 @@ public: static Log_event* read_log_event(IO_CACHE* file, const Format_description_log_event *description_event); /* print*() functions are used by mysqlbinlog */ - virtual void print(FILE* file, PRINT_EVENT_INFO* print_event_info= 0) = 0; + virtual void print(FILE* file, PRINT_EVENT_INFO* print_event_info) = 0; void print_timestamp(FILE* file, time_t *ts = 0); - void print_header(FILE* file, PRINT_EVENT_INFO* print_event_info= 0); + void print_header(FILE* file, PRINT_EVENT_INFO* print_event_info); + void print_base64(FILE* file, PRINT_EVENT_INFO* print_event_info); #endif static void *operator new(size_t size) @@ -646,7 +678,7 @@ public: virtual Log_event_type get_type_code() = 0; virtual bool is_valid() const = 0; virtual bool is_artificial_event() { return 0; } - inline bool get_cache_stmt() { return cache_stmt; } + inline bool get_cache_stmt() const { return cache_stmt; } Log_event(const char* buf, const Format_description_log_event* description_event); virtual ~Log_event() { free_temp_buf();} void register_temp_buf(char* buf) { temp_buf = buf; } @@ -778,8 +810,8 @@ public: uint32 q_len_arg); #endif /* HAVE_REPLICATION */ #else - void print_query_header(FILE* file, PRINT_EVENT_INFO* print_event_info= 0); - void print(FILE* file, PRINT_EVENT_INFO* print_event_info= 0); + void print_query_header(FILE* file, PRINT_EVENT_INFO* print_event_info); + void print(FILE* file, PRINT_EVENT_INFO* print_event_info); #endif Query_log_event(const char* buf, uint event_len, @@ -833,7 +865,7 @@ public: void pack_info(Protocol* protocol); int exec_event(struct st_relay_log_info* rli); #else - void print(FILE* file, PRINT_EVENT_INFO* print_event_info= 0); + void print(FILE* file, PRINT_EVENT_INFO* print_event_info); #endif Slave_log_event(const char* buf, uint event_len); @@ -921,7 +953,7 @@ public: bool use_rli_only_for_errors); #endif /* HAVE_REPLICATION */ #else - void print(FILE* file, PRINT_EVENT_INFO* print_event_info = 0); + void print(FILE* file, PRINT_EVENT_INFO* print_event_info); void print(FILE* file, PRINT_EVENT_INFO* print_event_info, bool commented); #endif @@ -1011,7 +1043,7 @@ public: #endif /* HAVE_REPLICATION */ #else Start_log_event_v3() {} - void print(FILE* file, PRINT_EVENT_INFO* print_event_info= 0); + void print(FILE* file, PRINT_EVENT_INFO* print_event_info); #endif Start_log_event_v3(const char* buf, @@ -1106,7 +1138,7 @@ public: int exec_event(struct st_relay_log_info* rli); #endif /* HAVE_REPLICATION */ #else - void print(FILE* file, PRINT_EVENT_INFO* print_event_info= 0); + void print(FILE* file, PRINT_EVENT_INFO* print_event_info); #endif Intvar_log_event(const char* buf, const Format_description_log_event* description_event); @@ -1147,7 +1179,7 @@ class Rand_log_event: public Log_event int exec_event(struct st_relay_log_info* rli); #endif /* HAVE_REPLICATION */ #else - void print(FILE* file, PRINT_EVENT_INFO* print_event_info= 0); + void print(FILE* file, PRINT_EVENT_INFO* print_event_info); #endif Rand_log_event(const char* buf, const Format_description_log_event* description_event); @@ -1184,7 +1216,7 @@ class Xid_log_event: public Log_event int exec_event(struct st_relay_log_info* rli); #endif /* HAVE_REPLICATION */ #else - void print(FILE* file, PRINT_EVENT_INFO* print_event_info= 0); + void print(FILE* file, PRINT_EVENT_INFO* print_event_info); #endif 
Xid_log_event(const char* buf, const Format_description_log_event* description_event); @@ -1195,6 +1227,9 @@ class Xid_log_event: public Log_event bool write(IO_CACHE* file); #endif bool is_valid() const { return 1; } +#if !defined(DBUG_OFF) && !defined(MYSQL_CLIENT) + static my_bool show_xid; +#endif }; /***************************************************************************** @@ -1226,7 +1261,7 @@ public: void pack_info(Protocol* protocol); int exec_event(struct st_relay_log_info* rli); #else - void print(FILE* file, PRINT_EVENT_INFO* print_event_info= 0); + void print(FILE* file, PRINT_EVENT_INFO* print_event_info); #endif User_var_log_event(const char* buf, const Format_description_log_event* description_event); @@ -1252,7 +1287,7 @@ public: {} int exec_event(struct st_relay_log_info* rli); #else - void print(FILE* file, PRINT_EVENT_INFO* print_event_info= 0); + void print(FILE* file, PRINT_EVENT_INFO* print_event_info); #endif Stop_log_event(const char* buf, const Format_description_log_event* description_event): @@ -1282,7 +1317,7 @@ public: uint ident_len; uint flags; #ifndef MYSQL_CLIENT - Rotate_log_event(THD* thd_arg, const char* new_log_ident_arg, + Rotate_log_event(const char* new_log_ident_arg, uint ident_len_arg, ulonglong pos_arg, uint flags); #ifdef HAVE_REPLICATION @@ -1290,7 +1325,7 @@ public: int exec_event(struct st_relay_log_info* rli); #endif /* HAVE_REPLICATION */ #else - void print(FILE* file, PRINT_EVENT_INFO* print_event_info= 0); + void print(FILE* file, PRINT_EVENT_INFO* print_event_info); #endif Rotate_log_event(const char* buf, uint event_len, @@ -1343,7 +1378,7 @@ public: int exec_event(struct st_relay_log_info* rli); #endif /* HAVE_REPLICATION */ #else - void print(FILE* file, PRINT_EVENT_INFO* print_event_info= 0); + void print(FILE* file, PRINT_EVENT_INFO* print_event_info); void print(FILE* file, PRINT_EVENT_INFO* print_event_info, bool enable_local); #endif @@ -1411,7 +1446,7 @@ public: virtual int get_create_or_append() const; #endif /* HAVE_REPLICATION */ #else - void print(FILE* file, PRINT_EVENT_INFO* print_event_info= 0); + void print(FILE* file, PRINT_EVENT_INFO* print_event_info); #endif Append_block_log_event(const char* buf, uint event_len, @@ -1422,8 +1457,8 @@ public: bool is_valid() const { return block != 0; } #ifndef MYSQL_CLIENT bool write(IO_CACHE* file); -#endif const char* get_db() { return db; } +#endif }; @@ -1446,7 +1481,7 @@ public: int exec_event(struct st_relay_log_info* rli); #endif /* HAVE_REPLICATION */ #else - void print(FILE* file, PRINT_EVENT_INFO* print_event_info= 0); + void print(FILE* file, PRINT_EVENT_INFO* print_event_info); void print(FILE* file, PRINT_EVENT_INFO* print_event_info, bool enable_local); #endif @@ -1458,8 +1493,8 @@ public: bool is_valid() const { return file_id != 0; } #ifndef MYSQL_CLIENT bool write(IO_CACHE* file); -#endif const char* get_db() { return db; } +#endif }; @@ -1482,7 +1517,7 @@ public: int exec_event(struct st_relay_log_info* rli); #endif /* HAVE_REPLICATION */ #else - void print(FILE* file, PRINT_EVENT_INFO* print_event_info= 0); + void print(FILE* file, PRINT_EVENT_INFO* print_event_info); #endif Execute_load_log_event(const char* buf, uint event_len, @@ -1493,8 +1528,8 @@ public: bool is_valid() const { return file_id != 0; } #ifndef MYSQL_CLIENT bool write(IO_CACHE* file); -#endif const char* get_db() { return db; } +#endif }; @@ -1567,7 +1602,7 @@ public: int exec_event(struct st_relay_log_info* rli); #endif /* HAVE_REPLICATION */ #else - void print(FILE* file, PRINT_EVENT_INFO* 
print_event_info= 0); + void print(FILE* file, PRINT_EVENT_INFO* print_event_info); /* Prints the query as LOAD DATA LOCAL and with rewritten filename */ void print(FILE* file, PRINT_EVENT_INFO* print_event_info, const char *local_fname); @@ -1599,10 +1634,523 @@ public: Log_event(buf, description_event) {} ~Unknown_log_event() {} - void print(FILE* file, PRINT_EVENT_INFO* print_event_info= 0); + void print(FILE* file, PRINT_EVENT_INFO* print_event_info); Log_event_type get_type_code() { return UNKNOWN_EVENT;} bool is_valid() const { return 1; } }; #endif char *str_to_hex(char *to, const char *from, uint len); + +/***************************************************************************** + + Table map log event class + + Create a mapping from a (database name, table name) couple to a table + identifier (an integer number). + + ****************************************************************************/ + +class Table_map_log_event : public Log_event +{ +public: + /* Constants */ + enum + { + TYPE_CODE = TABLE_MAP_EVENT + }; + + enum enum_error + { + ERR_OPEN_FAILURE = -1, /* Failure to open table */ + ERR_OK = 0, /* No error */ + ERR_TABLE_LIMIT_EXCEEDED = 1, /* No more room for tables */ + ERR_OUT_OF_MEM = 2, /* Out of memory */ + ERR_BAD_TABLE_DEF = 3, /* Table definition does not match */ + ERR_RBR_TO_SBR = 4 /* daisy-chanining RBR to SBR not allowed */ + }; + + enum enum_flag + { + /* + Nothing here right now, but the flags support is there in + preparation for changes that are coming. + */ + }; + + typedef uint16 flag_set; + + /* Special constants representing sets of flags */ + enum + { + NO_FLAGS = 0U + }; + + void set_flags(flag_set flag) { m_flags |= flag; } + void clear_flags(flag_set flag) { m_flags &= ~flag; } + flag_set get_flags(flag_set flag) const { return m_flags & flag; } + +#ifndef MYSQL_CLIENT + Table_map_log_event(THD *thd, TABLE *tbl, ulong tid, + bool is_transactional, uint16 flags); +#endif +#ifdef HAVE_REPLICATION + Table_map_log_event(const char *buf, uint event_len, + const Format_description_log_event *description_event); +#endif + + ~Table_map_log_event(); + + virtual Log_event_type get_type_code() { return TABLE_MAP_EVENT; } + virtual bool is_valid() const { return m_memory; /* we check malloc */ } + + virtual int get_data_size() { return m_data_size; } +#ifndef MYSQL_CLIENT + virtual bool write_data_header(IO_CACHE *file); + virtual bool write_data_body(IO_CACHE *file); + virtual const char *get_db() { return m_dbnam; } +#endif + +#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) + virtual int exec_event(struct st_relay_log_info *rli); + virtual void pack_info(Protocol *protocol); +#endif + +#ifdef MYSQL_CLIENT + virtual void print(FILE *file, PRINT_EVENT_INFO *print_event_info); +#endif + + +private: +#ifndef MYSQL_CLIENT + TABLE *m_table; +#endif + char const *m_dbnam; + my_size_t m_dblen; + char const *m_tblnam; + my_size_t m_tbllen; + ulong m_colcnt; + unsigned char *m_coltype; + + gptr m_memory; + ulong m_table_id; + flag_set m_flags; + + my_size_t m_data_size; +}; + + +/***************************************************************************** + + Row level log event class. + + Common base class for all row-level log events. + + RESPONSIBILITIES + + Encode the common parts of all events containing rows, which are: + - Write data header and data body to an IO_CACHE. + - Provide an interface for adding an individual row to the event. 
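To make the relationship between these two new event classes concrete, here is a small self-contained sketch (illustrative types only, not code from this patch) of how a row-based statement is laid out in the binary log: a Table_map_log_event first announces the (database, table) to table-id mapping, and the row events that follow refer to the table only through that id, with the last one flagged as the end of the statement.

#include <cstdint>
#include <string>
#include <vector>

/* Stand-in for Table_map_log_event: announces which table an id refers to. */
struct TableMapSketch {
  std::string db;
  std::string table;
  uint64_t    table_id;
};

/* Stand-in for a Write/Update/Delete rows event: rows keyed by table id. */
struct RowsSketch {
  uint64_t             table_id;     /* resolved through the map on the slave   */
  std::vector<uint8_t> packed_rows;  /* filled through an add_row_data()-style call */
  bool                 stmt_end;     /* STMT_END_F: last event of the statement */
};

/* One multi-row statement therefore becomes: one TableMapSketch per table
   touched, then one or more RowsSketch events, the final one with
   stmt_end == true. */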
+ + ****************************************************************************/ + +class Rows_log_event : public Log_event +{ +public: + /* + These definitions allow you to combine the flags into an + appropriate flag set using the normal bitwise operators. The + implicit conversion from an enum-constant to an integer is + accepted by the compiler, which is then used to set the real set + of flags. + */ + + enum enum_flag + { + /* Last event of a statement */ + STMT_END_F = (1U << 0), + + /* Value of the OPTION_NO_FOREIGN_KEY_CHECKS flag in thd->options */ + NO_FOREIGN_KEY_CHECKS_F = (1U << 1), + + /* Value of the OPTION_RELAXED_UNIQUE_CHECKS flag in thd->options */ + RELAXED_UNIQUE_CHECKS_F = (1U << 2) + }; + + typedef uint16 flag_set; + + /* Special constants representing sets of flags */ + enum + { + NO_FLAGS = 0U + }; + + virtual ~Rows_log_event(); + + void set_flags(flag_set flags) { m_flags |= flags; } + void clear_flags(flag_set flags) { m_flags &= ~flags; } + flag_set get_flags(flag_set flags) const { return m_flags & flags; } + +#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) + virtual int exec_event(struct st_relay_log_info *rli); +#ifdef DBUG_RBR + virtual void pack_info(Protocol *protocol); +#endif +#endif + +#ifdef MYSQL_CLIENT + /* not for direct call, each derived has its own ::print() */ + virtual void print(FILE *file, PRINT_EVENT_INFO *print_event_info)= 0; +#endif + +#ifndef MYSQL_CLIENT + int add_row_data(byte *data, my_size_t length) + { + return do_add_row_data(data,length); + } +#endif + + /* Member functions to implement superclass interface */ + virtual int get_data_size() + { + DBUG_EXECUTE_IF("old_row_based_repl_4_byte_map_id_master", + return 6 + 1 + no_bytes_in_map(&m_cols) + + (m_rows_cur - m_rows_buf);); + return ROWS_HEADER_LEN + 1 + no_bytes_in_map(&m_cols) + + (m_rows_cur - m_rows_buf); + } + + MY_BITMAP const *get_cols() const { return &m_cols; } + my_size_t get_width() const { return m_width; } + ulong get_table_id() const { return m_table_id; } + +#ifndef MYSQL_CLIENT + virtual bool write_data_header(IO_CACHE *file); + virtual bool write_data_body(IO_CACHE *file); + virtual const char *get_db() { return m_table->s->db.str; } +#endif + virtual bool is_valid() const + { + /* that's how we check malloc() succeeded */ + return m_rows_buf && m_cols.bitmap; + } + + /* + If there is no table map active for the event, write one to the + binary log. + + LOCK_log has to be aquired before calling this function. + + PARAMETERS + thd - Thread to use when writing the table map + + RETURN VALUE + Error code, or zero if write succeeded. + */ +#if !defined(MYSQL_CLIENT) && defined(HAVE_ROW_BASED_REPLICATION) + int maybe_write_table_map(THD *thd, IO_CACHE *file, MYSQL_LOG *log) const + { + /* + N.B., get_cache_stmt() returns the value of 'using_trans' that + was provided to the constructor, i.e., get_cache_stmt() == true + if and only if the table is transactional. + */ + + int result= 0; + if (!log->is_table_mapped(m_table)) + result= log->write_table_map(thd, file, m_table, get_cache_stmt()); + return result; + } +#endif + +protected: + /* + The constructors are protected since you're supposed to inherit + this class, not create instances of this class. 
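The flag handling above (set_flags/clear_flags/get_flags combined with the enum constants) is plain bitwise arithmetic; a minimal standalone sketch of the same pattern, with simplified types and not the server's code, looks like this:

#include <cstdint>

typedef uint16_t flag_set;

enum enum_flag {
  STMT_END_F              = (1U << 0),  /* last event of a statement  */
  NO_FOREIGN_KEY_CHECKS_F = (1U << 1),  /* mirrors a thd->options bit */
  RELAXED_UNIQUE_CHECKS_F = (1U << 2)   /* mirrors a thd->options bit */
};

struct flag_holder {
  flag_set m_flags;
  flag_holder() : m_flags(0) {}
  void     set_flags(flag_set f)       { m_flags |= f;  }     /* turn bits on  */
  void     clear_flags(flag_set f)     { m_flags &= ~f; }     /* turn bits off */
  flag_set get_flags(flag_set f) const { return m_flags & f; } /* test bits    */
};

/* Enum constants convert to int, so callers can OR them before the call:
   h.set_flags(STMT_END_F | NO_FOREIGN_KEY_CHECKS_F);                          */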
+ */ +#ifndef MYSQL_CLIENT + Rows_log_event(THD*, TABLE*, ulong table_id, + MY_BITMAP const *cols, bool is_transactional); +#endif + Rows_log_event(const char *row_data, uint event_len, + Log_event_type event_type, + const Format_description_log_event *description_event); + +#ifndef MYSQL_CLIENT + virtual int do_add_row_data(byte *data, my_size_t length); +#endif + +#ifndef MYSQL_CLIENT + TABLE *m_table; /* The table the rows belong to */ +#endif + ulong m_table_id; /* Table ID */ + MY_BITMAP m_cols; /* Bitmap denoting columns available */ + ulong m_width; /* The width of the columns bitmap */ + + /* Bit buffer in the same memory as the class */ + uint32 m_bitbuf[128/(sizeof(uint32)*8)]; + + byte *m_rows_buf; /* The rows in packed format */ + byte *m_rows_cur; /* One-after the end of the data */ + byte *m_rows_end; /* One-after the end of the allocated space */ + + flag_set m_flags; /* Flags for row-level events */ + +private: + +#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) + /* + Primitive to prepare for a sequence of row executions. + + DESCRIPTION + + Before doing a sequence of do_prepare_row() and do_exec_row() + calls, this member function should be called to prepare for the + entire sequence. Typically, this member function will allocate + space for any buffers that are needed for the two member + functions mentioned above. + + RETURN VALUE + + The member function will return 0 if all went OK, or a non-zero + error code otherwise. + */ + virtual int do_before_row_operations(TABLE *table) = 0; + + /* + Primitive to clean up after a sequence of row executions. + + DESCRIPTION + + After doing a sequence of do_prepare_row() and do_exec_row(), + this member function should be called to clean up and release + any allocated buffers. + */ + virtual int do_after_row_operations(TABLE *table, int error) = 0; + + /* + Primitive to prepare for handling one row in a row-level event. + + DESCRIPTION + + The member function prepares for execution of operations needed for one + row in a row-level event by reading up data from the buffer containing + the row. No specific interpretation of the data is normally done here, + since SQL thread specific data is not available: that data is made + available for the do_exec function. + + RETURN VALUE + A pointer to the start of the next row, or NULL if the preparation + failed. Currently, preparation cannot fail, but don't rely on this + behavior. + */ + virtual char const *do_prepare_row(THD*, TABLE*, char const *row_start) = 0; + + /* + Primitive to do the actual execution necessary for a row. + + DESCRIPTION + The member function will do the actual execution needed to handle a row. + + RETURN VALUE + 0 if execution succeeded, 1 if execution failed. + + */ + virtual int do_exec_row(TABLE *table) = 0; +#endif /* !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) */ +}; + + +/***************************************************************************** + + Write row log event class + + Log row insertions and updates. The event contain several + insert/update rows for a table. Note that each event contains only + rows for one table. 
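The four pure virtual primitives documented above are meant to be driven once per event, with do_prepare_row() and do_exec_row() called for every row in the buffer; the three subclasses differ mainly in which record images the primitives consume (inserts use the after-image, deletes the before-image, updates a before/after pair). The loop below is a simplified, self-contained sketch of that calling sequence; the driver type and function are hypothetical, in the server the equivalent logic lives inside exec_event().

#include <cstddef>

struct Table {};  /* stand-in for the server's TABLE */

/* Hypothetical interface mirroring the four primitives documented above. */
struct RowApplier {
  virtual int         do_before_row_operations(Table *t) = 0;
  virtual int         do_after_row_operations(Table *t, int error) = 0;
  virtual const char *do_prepare_row(Table *t, const char *row_start) = 0;
  virtual int         do_exec_row(Table *t) = 0;
  virtual ~RowApplier() {}
};

/* Sketch of the loop conceptually run over the packed row buffer. */
int apply_rows(RowApplier &a, Table *t, const char *row, const char *rows_end)
{
  int error = a.do_before_row_operations(t);    /* allocate buffers once     */
  while (!error && row != NULL && row < rows_end)
  {
    row = a.do_prepare_row(t, row);             /* unpack one row            */
    if (row == NULL) { error = 1; break; }      /* preparation failed        */
    error = a.do_exec_row(t);                   /* insert/update/delete it   */
  }
  return a.do_after_row_operations(t, error);   /* release buffers, report   */
}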
+ + ****************************************************************************/ +class Write_rows_log_event : public Rows_log_event +{ +public: + enum + { + /* Support interface to THD::binlog_prepare_pending_rows_event */ + TYPE_CODE = WRITE_ROWS_EVENT + }; + +#if !defined(MYSQL_CLIENT) + Write_rows_log_event(THD*, TABLE*, ulong table_id, + MY_BITMAP const *cols, bool is_transactional); +#endif +#ifdef HAVE_REPLICATION + Write_rows_log_event(const char *buf, uint event_len, + const Format_description_log_event *description_event); +#endif +#if !defined(MYSQL_CLIENT) && defined(HAVE_ROW_BASED_REPLICATION) + static bool binlog_row_logging_function(THD *thd, TABLE *table, + bool is_transactional, + MY_BITMAP *cols, + uint fields, + const byte *before_record + __attribute__((unused)), + const byte *after_record) + { + return thd->binlog_write_row(table, is_transactional, + cols, fields, after_record); + } +#endif + +private: + virtual Log_event_type get_type_code() { return (Log_event_type)TYPE_CODE; } + +#ifdef MYSQL_CLIENT + void print(FILE *file, PRINT_EVENT_INFO *print_event_info); +#endif + +#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) + gptr m_memory; + byte *m_search_record; + + virtual int do_before_row_operations(TABLE *table); + virtual int do_after_row_operations(TABLE *table, int error); + virtual char const *do_prepare_row(THD*, TABLE*, char const *row_start); + virtual int do_exec_row(TABLE *table); +#endif +}; + + +/***************************************************************************** + + Update rows log event class + + Log row updates with a before image. The event contain several + update rows for a table. Note that each event contains only rows for + one table. + + Also note that the row data consists of pairs of row data: one row + for the old data and one row for the new data. + + ****************************************************************************/ +class Update_rows_log_event : public Rows_log_event +{ +public: + enum + { + /* Support interface to THD::binlog_prepare_pending_rows_event */ + TYPE_CODE = UPDATE_ROWS_EVENT + }; + +#ifndef MYSQL_CLIENT + Update_rows_log_event(THD*, TABLE*, ulong table_id, + MY_BITMAP const *cols, bool is_transactional); +#endif + +#ifdef HAVE_REPLICATION + Update_rows_log_event(const char *buf, uint event_len, + const Format_description_log_event *description_event); +#endif + +#if !defined(MYSQL_CLIENT) && defined(HAVE_ROW_BASED_REPLICATION) + static bool binlog_row_logging_function(THD *thd, TABLE *table, + bool is_transactional, + MY_BITMAP *cols, + uint fields, + const byte *before_record, + const byte *after_record) + { + return thd->binlog_update_row(table, is_transactional, + cols, fields, before_record, after_record); + } +#endif + +private: + virtual Log_event_type get_type_code() { return (Log_event_type)TYPE_CODE; } + +#ifdef MYSQL_CLIENT + void print(FILE *file, PRINT_EVENT_INFO *print_event_info); +#endif + +#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) + gptr m_memory; + byte *m_key; + byte *m_search_record; + + virtual int do_before_row_operations(TABLE *table); + virtual int do_after_row_operations(TABLE *table, int error); + virtual char const *do_prepare_row(THD*, TABLE*, char const *row_start); + virtual int do_exec_row(TABLE *table); +#endif /* !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) */ +}; + +/***************************************************************************** + + Delete rows log event class. + + Log row deletions. 
The event contains several delete rows for a
+ table. Note that each event contains only rows for one table.
+
+ RESPONSIBILITIES
+
+ - Act as a container for rows that have been deleted on the master
+ and should be deleted on the slave.
+
+ COLLABORATION
+
+ Row_writer
+ Create the event and add rows to the event.
+ Row_reader
+ Extract the rows from the event.
+
+ ****************************************************************************/
+class Delete_rows_log_event : public Rows_log_event
+{
+public:
+ enum
+ {
+ /* Support interface to THD::binlog_prepare_pending_rows_event */
+ TYPE_CODE = DELETE_ROWS_EVENT
+ };
+
+#ifndef MYSQL_CLIENT
+ Delete_rows_log_event(THD*, TABLE*, ulong,
+ MY_BITMAP const *cols, bool is_transactional);
+#endif
+#ifdef HAVE_REPLICATION
+ Delete_rows_log_event(const char *buf, uint event_len,
+ const Format_description_log_event *description_event);
+#endif
+#if !defined(MYSQL_CLIENT) && defined(HAVE_ROW_BASED_REPLICATION)
+ static bool binlog_row_logging_function(THD *thd, TABLE *table,
+ bool is_transactional,
+ MY_BITMAP *cols,
+ uint fields,
+ const byte *before_record,
+ const byte *after_record
+ __attribute__((unused)))
+ {
+ return thd->binlog_delete_row(table, is_transactional,
+ cols, fields, before_record);
+ }
+#endif
+
+private:
+ virtual Log_event_type get_type_code() { return (Log_event_type)TYPE_CODE; }
+
+#ifdef MYSQL_CLIENT
+ void print(FILE *file, PRINT_EVENT_INFO *print_event_info);
+#endif
+
+#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION)
+ gptr m_memory;
+ byte *m_key;
+ byte *m_search_record;
+
+ virtual int do_before_row_operations(TABLE *table);
+ virtual int do_after_row_operations(TABLE *table, int error);
+ virtual char const *do_prepare_row(THD*, TABLE*, char const *row_start);
+ virtual int do_exec_row(TABLE *table);
+#endif
+};
+
+
 #endif /* _log_event_h */
diff --git a/sql/mysql_priv.h b/sql/mysql_priv.h
index f18447de71c..48d1ea8e798 100644
--- a/sql/mysql_priv.h
+++ b/sql/mysql_priv.h
@@ -248,50 +248,50 @@ extern CHARSET_INFO *national_charset_info, *table_alias_charset;
 TODO: separate three contexts above, move them to separate bitfields.
*/ -#define SELECT_DISTINCT (1L << 0) // SELECT, user -#define SELECT_STRAIGHT_JOIN (1L << 1) // SELECT, user -#define SELECT_DESCRIBE (1L << 2) // SELECT, user -#define SELECT_SMALL_RESULT (1L << 3) // SELECT, user -#define SELECT_BIG_RESULT (1L << 4) // SELECT, user -#define OPTION_FOUND_ROWS (1L << 5) // SELECT, user -#define OPTION_TO_QUERY_CACHE (1L << 6) // SELECT, user -#define SELECT_NO_JOIN_CACHE (1L << 7) // intern -#define OPTION_BIG_TABLES (1L << 8) // THD, user -#define OPTION_BIG_SELECTS (1L << 9) // THD, user -#define OPTION_LOG_OFF (1L << 10) // THD, user -#define OPTION_UPDATE_LOG (1L << 11) // THD, user, unused -#define TMP_TABLE_ALL_COLUMNS (1L << 12) // SELECT, intern -#define OPTION_WARNINGS (1L << 13) // THD, user -#define OPTION_AUTO_IS_NULL (1L << 14) // THD, user, binlog -#define OPTION_FOUND_COMMENT (1L << 15) // SELECT, intern, parser -#define OPTION_SAFE_UPDATES (1L << 16) // THD, user -#define OPTION_BUFFER_RESULT (1L << 17) // SELECT, user -#define OPTION_BIN_LOG (1L << 18) // THD, user -#define OPTION_NOT_AUTOCOMMIT (1L << 19) // THD, user -#define OPTION_BEGIN (1L << 20) // THD, intern -#define OPTION_TABLE_LOCK (1L << 21) // THD, intern -#define OPTION_QUICK (1L << 22) // SELECT (for DELETE) -#define OPTION_QUOTE_SHOW_CREATE (1L << 23) // THD, user +#define SELECT_DISTINCT (LL(1) << 0) // SELECT, user +#define SELECT_STRAIGHT_JOIN (LL(1) << 1) // SELECT, user +#define SELECT_DESCRIBE (LL(1) << 2) // SELECT, user +#define SELECT_SMALL_RESULT (LL(1) << 3) // SELECT, user +#define SELECT_BIG_RESULT (LL(1) << 4) // SELECT, user +#define OPTION_FOUND_ROWS (LL(1) << 5) // SELECT, user +#define OPTION_TO_QUERY_CACHE (LL(1) << 6) // SELECT, user +#define SELECT_NO_JOIN_CACHE (LL(1) << 7) // intern +#define OPTION_BIG_TABLES (LL(1) << 8) // THD, user +#define OPTION_BIG_SELECTS (LL(1) << 9) // THD, user +#define OPTION_LOG_OFF (LL(1) << 10) // THD, user +#define OPTION_UPDATE_LOG (LL(1) << 11) // THD, user, unused +#define TMP_TABLE_ALL_COLUMNS (LL(1) << 12) // SELECT, intern +#define OPTION_WARNINGS (LL(1) << 13) // THD, user +#define OPTION_AUTO_IS_NULL (LL(1) << 14) // THD, user, binlog +#define OPTION_FOUND_COMMENT (LL(1) << 15) // SELECT, intern, parser +#define OPTION_SAFE_UPDATES (LL(1) << 16) // THD, user +#define OPTION_BUFFER_RESULT (LL(1) << 17) // SELECT, user +#define OPTION_BIN_LOG (LL(1) << 18) // THD, user +#define OPTION_NOT_AUTOCOMMIT (LL(1) << 19) // THD, user +#define OPTION_BEGIN (LL(1) << 20) // THD, intern +#define OPTION_TABLE_LOCK (LL(1) << 21) // THD, intern +#define OPTION_QUICK (LL(1) << 22) // SELECT (for DELETE) +#define OPTION_QUOTE_SHOW_CREATE (LL(1) << 23) // THD, user /* Thr following is used to detect a conflict with DISTINCT in the user query has requested */ -#define SELECT_ALL (1L << 24) // SELECT, user, parser +#define SELECT_ALL (LL(1) << 24) // SELECT, user, parser /* Set if we are updating a non-transaction safe table */ -#define OPTION_STATUS_NO_TRANS_UPDATE (1L << 25) // THD, intern +#define OPTION_STATUS_NO_TRANS_UPDATE (LL(1) << 25) // THD, intern /* The following can be set when importing tables in a 'wrong order' to suppress foreign key checks */ -#define OPTION_NO_FOREIGN_KEY_CHECKS (1L << 26) // THD, user, binlog +#define OPTION_NO_FOREIGN_KEY_CHECKS (LL(1) << 26) // THD, user, binlog /* The following speeds up inserts to InnoDB tables by suppressing unique key checks in some cases */ -#define OPTION_RELAXED_UNIQUE_CHECKS (1L << 27) // THD, user, binlog -#define SELECT_NO_UNLOCK (1L << 28) // SELECT, intern -#define 
OPTION_SCHEMA_TABLE (1L << 29) // SELECT, intern +#define OPTION_RELAXED_UNIQUE_CHECKS (LL(1) << 27) // THD, user, binlog +#define SELECT_NO_UNLOCK (LL(1) << 28) // SELECT, intern +#define OPTION_SCHEMA_TABLE (LL(1) << 29) // SELECT, intern /* Flag set if setup_tables already done */ -#define OPTION_SETUP_TABLES_DONE (1L << 30) // intern +#define OPTION_SETUP_TABLES_DONE (LL(1) << 30) // intern /* If not set then the thread will ignore all warnings with level notes. */ -#define OPTION_SQL_NOTES (1UL << 31) // THD, user +#define OPTION_SQL_NOTES (LL(1) << 31) // THD, user /* Force the used temporary table to be a MyISAM table (because we will use fulltext functions when reading from it. @@ -600,6 +600,7 @@ bool mysql_create_db(THD *thd, char *db, HA_CREATE_INFO *create, bool silent); bool mysql_alter_db(THD *thd, const char *db, HA_CREATE_INFO *create); bool mysql_rm_db(THD *thd,char *db,bool if_exists, bool silent); void mysql_binlog_send(THD* thd, char* log_ident, my_off_t pos, ushort flags); +void mysql_client_binlog_statement(THD *thd); bool mysql_rm_table(THD *thd,TABLE_LIST *tables, my_bool if_exists, my_bool drop_temporary); int mysql_rm_table_part2(THD *thd, TABLE_LIST *tables, bool if_exists, @@ -1198,6 +1199,13 @@ extern ulong what_to_log,flush_time; extern ulong query_buff_size, thread_stack; extern ulong binlog_cache_size, max_binlog_cache_size, open_files_limit; extern ulong max_binlog_size, max_relay_log_size; +extern const char *opt_binlog_format; +#ifdef HAVE_ROW_BASED_REPLICATION +extern my_bool binlog_row_based; +extern ulong opt_binlog_rows_event_max_size; +#else +extern const my_bool binlog_row_based; +#endif extern ulong rpl_recovery_rank, thread_cache_size; extern ulong back_log; extern ulong specialflag, current_pid; @@ -1338,6 +1346,7 @@ extern handlerton myisammrg_hton; extern handlerton heap_hton; extern SHOW_COMP_OPTION have_isam; +extern SHOW_COMP_OPTION have_row_based_replication; extern SHOW_COMP_OPTION have_raid, have_openssl, have_symlink; extern SHOW_COMP_OPTION have_query_cache; extern SHOW_COMP_OPTION have_geometry, have_rtree_keys; diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 20d09ae0228..2a51afbee8d 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -441,6 +441,33 @@ volatile bool mqh_used = 0; my_bool opt_noacl; my_bool sp_automatic_privileges= 1; +#ifdef HAVE_ROW_BASED_REPLICATION +/* + This variable below serves as an optimization for (opt_binlog_format == + BF_ROW) as we need to do this test for every row. Stmt-based is default. +*/ +my_bool binlog_row_based= FALSE; +ulong opt_binlog_rows_event_max_size; +const char *binlog_format_names[]= {"STATEMENT", "ROW", NullS}; +/* + Note that BF_UNSPECIFIED is last, after the end of binlog_format_names: it + has no corresponding cell in this array. We use this value to be able to + know if the user has explicitely specified a binlog format (then we require + also --log-bin) or not (then we fall back to statement-based). +*/ +enum binlog_format { BF_STMT= 0, BF_ROW= 1, BF_UNSPECIFIED= 2 }; +#else +const my_bool binlog_row_based= FALSE; +const char *binlog_format_names[]= {"STATEMENT", NullS}; +enum binlog_format { BF_STMT= 0, BF_UNSPECIFIED= 2 }; +#endif + +TYPELIB binlog_format_typelib= + { array_elements(binlog_format_names)-1,"", + binlog_format_names, NULL }; +const char *opt_binlog_format= 0; +enum binlog_format opt_binlog_format_id= BF_UNSPECIFIED; + #ifdef HAVE_INITGROUPS static bool calling_initgroups= FALSE; /* Used in SIGSEGV handler. 
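A short aside on the mysql_priv.h hunk above, where every option bit moves from 1L << n to LL(1) << n: thd->options is a 64-bit word, and on platforms where long is 32 bits a signed 1L shifted into bit 31 sign-extends when widened, polluting the upper half of the word; doing the shift in 64 bits avoids that and leaves room for flags above bit 31. A tiny demonstration, assuming a 32-bit long and intended as illustration only:

#include <cstdint>
#include <cstdio>

int main()
{
  int32_t  as_long31 = INT32_MIN;            /* what (1L << 31) yields with a 32-bit long */
  uint64_t widened   = (uint64_t)(int64_t) as_long31;  /* sign-extended on widening       */
  uint64_t shifted64 = 1ULL << 31;           /* the LL(1) << 31 form of the same bit      */

  printf("sign-extended 32-bit shift: %#018llx\n", (unsigned long long) widened);
  printf("64-bit shift              : %#018llx\n", (unsigned long long) shifted64);
  /* prints 0xffffffff80000000 versus 0x0000000080000000 */
  return 0;
}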
*/ #endif @@ -528,6 +555,7 @@ MY_BITMAP temp_pool; CHARSET_INFO *system_charset_info, *files_charset_info ; CHARSET_INFO *national_charset_info, *table_alias_charset; +SHOW_COMP_OPTION have_row_based_replication; SHOW_COMP_OPTION have_raid, have_openssl, have_symlink, have_query_cache; SHOW_COMP_OPTION have_geometry, have_rtree_keys; SHOW_COMP_OPTION have_crypt, have_compress; @@ -3032,8 +3060,44 @@ with --log-bin instead."); { sql_print_warning("You need to use --log-bin to make " "--log-slave-updates work."); - unireg_abort(1); + unireg_abort(1); + } + + if (!opt_bin_log && (opt_binlog_format_id != BF_UNSPECIFIED)) + { + sql_print_warning("You need to use --log-bin to make " + "--binlog-format work."); + unireg_abort(1); } + if (opt_binlog_format_id == BF_UNSPECIFIED) + { + /* + We use statement-based by default, but could change this to be row-based + if this is a cluster build (i.e. have_ndbcluster is true)... + */ + opt_binlog_format_id= BF_STMT; + } +#ifdef HAVE_ROW_BASED_REPLICATION + if (opt_binlog_format_id == BF_ROW) + { + binlog_row_based= TRUE; + /* + Row-based binlogging turns on InnoDB unsafe locking, because the locks + are not needed when using row-based binlogging. In fact + innodb-locks-unsafe-for-binlog is unsafe only for stmt-based, it's + safe for row-based. + */ +#ifdef HAVE_INNOBASE_DB + innobase_locks_unsafe_for_binlog= TRUE; +#endif + /* Trust stored function creators because they can do no harm */ + trust_function_creators= 1; + } +#endif + /* Check that we have not let the format to unspecified at this point */ + DBUG_ASSERT((uint)opt_binlog_format_id <= + array_elements(binlog_format_names)-1); + opt_binlog_format= binlog_format_names[opt_binlog_format_id]; if (opt_slow_log) mysql_slow_log.open_slow_log(opt_slow_logname); @@ -4504,6 +4568,13 @@ enum options_mysqld OPT_SQL_BIN_UPDATE_SAME, OPT_REPLICATE_DO_DB, OPT_REPLICATE_IGNORE_DB, OPT_LOG_SLAVE_UPDATES, OPT_BINLOG_DO_DB, OPT_BINLOG_IGNORE_DB, + OPT_BINLOG_FORMAT, +#ifndef DBUG_OFF + OPT_BINLOG_SHOW_XID, +#endif +#ifdef HAVE_ROW_BASED_REPLICATION + OPT_BINLOG_ROWS_EVENT_MAX_SIZE, +#endif OPT_WANT_CORE, OPT_CONCURRENT_INSERT, OPT_MEMLOCK, OPT_MYISAM_RECOVER, OPT_REPLICATE_REWRITE_DB, OPT_SERVER_ID, @@ -4732,12 +4803,46 @@ Disable with --skip-bdb (will save memory).", {"bind-address", OPT_BIND_ADDRESS, "IP address to bind to.", (gptr*) &my_bind_addr_str, (gptr*) &my_bind_addr_str, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"binlog-format", OPT_BINLOG_FORMAT, +#ifdef HAVE_ROW_BASED_REPLICATION + "Tell the master the form of binary logging to use: either 'row' for " + "row-based binary logging (which automatically turns on " + "innodb_locks_unsafe_for_binlog as it is safe in this case), or " + "'statement' for statement-based logging. 
", +#else + "Tell the master the form of binary logging to use: this release build " + "supports only statement-based binary logging, so only 'statement' is " + "a legal value; MySQL-Max release builds support row-based binary logging " + "in addition.", +#endif + 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0 }, {"binlog-do-db", OPT_BINLOG_DO_DB, "Tells the master it should log updates for the specified database, and exclude all others not explicitly mentioned.", 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, {"binlog-ignore-db", OPT_BINLOG_IGNORE_DB, "Tells the master that updates to the given database should not be logged tothe binary log.", 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, +#if !defined(DBUG_OFF) && !defined(MYSQL_CLIENT) + {"binlog-show-xid", OPT_BINLOG_SHOW_XID, + "Option used by mysql-test for debugging and testing: " + "do not display the XID in SHOW BINLOG EVENTS; " + "may be removed in future versions", + (gptr*) &Xid_log_event::show_xid, (gptr*) &Xid_log_event::show_xid, + 0, GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0}, +#endif +#ifdef HAVE_ROW_BASED_REPLICATION + {"binlog-row-event-max-size", OPT_BINLOG_ROWS_EVENT_MAX_SIZE, + "The maximum size of a row-based binary log event in bytes. Rows will be " + "grouped into events smaller than this size if possible. " + "The value has to be a multiple of 256.", + (gptr*) &opt_binlog_rows_event_max_size, + (gptr*) &opt_binlog_rows_event_max_size, 0, + GET_ULONG, REQUIRED_ARG, + /* def_value */ 1024, /* min_value */ 256, /* max_value */ ULONG_MAX, + /* sub_size */ 0, /* block_size */ 256, + /* app_type */ 0 + }, +#endif {"bootstrap", OPT_BOOTSTRAP, "Used by mysql installation scripts.", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, {"character-set-client-handshake", OPT_CHARACTER_SET_CLIENT_HANDSHAKE, @@ -4905,7 +5010,9 @@ Disable with --skip-innodb-doublewrite.", (gptr*) &innobase_use_doublewrite, (gptr*) &innobase_unix_file_flush_method, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, {"innodb_locks_unsafe_for_binlog", OPT_INNODB_LOCKS_UNSAFE_FOR_BINLOG, - "Force InnoDB not to use next-key locking. Instead use only row-level locking", + "Force InnoDB not to use next-key locking, to use only row-level locking." + " This is unsafe if you are using statement-based binary logging, and safe" + " if you are using row-based binary logging.", (gptr*) &innobase_locks_unsafe_for_binlog, (gptr*) &innobase_locks_unsafe_for_binlog, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, {"innodb_log_arch_dir", OPT_INNODB_LOG_ARCH_DIR, @@ -4984,8 +5091,12 @@ Disable with --skip-innodb-doublewrite.", (gptr*) &innobase_use_doublewrite, {"log-bin-trust-function-creators", OPT_LOG_BIN_TRUST_FUNCTION_CREATORS, "If equal to 0 (the default), then when --log-bin is used, creation of " "a function is allowed only to users having the SUPER privilege and only " - "if this function may not break binary logging.", - (gptr*) &trust_function_creators, (gptr*) &trust_function_creators, 0, + "if this function may not break binary logging." +#ifdef HAVE_ROW_BASED_REPLICATION + " If using --binlog-format=row, the security issues do not exist and the " + "binary logging cannot break so this option is automatically set to 1." 
+#endif + ,(gptr*) &trust_function_creators, (gptr*) &trust_function_creators, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, {"log-error", OPT_ERROR_LOG_FILE, "Error log file.", (gptr*) &log_error_file_ptr, (gptr*) &log_error_file_ptr, 0, GET_STR, @@ -6459,6 +6570,11 @@ static void mysql_init_variables(void) "d:t:i:o,/tmp/mysqld.trace"); #endif opt_error_log= IF_WIN(1,0); +#ifdef HAVE_ROW_BASED_REPLICATION + have_row_based_replication= SHOW_OPTION_YES; +#else + have_row_based_replication= SHOW_OPTION_NO; +#endif #ifdef WITH_NDBCLUSTER_STORAGE_ENGINE have_ndbcluster=SHOW_OPTION_DISABLED; global_system_variables.ndb_index_stat_enable=TRUE; @@ -6682,6 +6798,28 @@ get_one_option(int optid, const struct my_option *opt __attribute__((unused)), binlog_filter->add_ignore_db(argument); break; } + case OPT_BINLOG_FORMAT: + { + int id; + if ((id= find_type(argument, &binlog_format_typelib, 2)) <= 0) + { +#ifdef HAVE_ROW_BASED_REPLICATION + fprintf(stderr, + "Unknown binary log format: '%s' " + "(should be '%s' or '%s')\n", + argument, + binlog_format_names[BF_STMT], + binlog_format_names[BF_ROW]); +#else + fprintf(stderr, + "Unknown binary log format: '%s' (only legal value is '%s')\n", + argument, binlog_format_names[BF_STMT]); +#endif + exit(1); + } + opt_binlog_format_id= (enum binlog_format)(id-1); + break; + } case (int)OPT_BINLOG_DO_DB: { binlog_filter->add_do_db(argument); @@ -7229,6 +7367,7 @@ static void get_options(int argc,char **argv) init_global_datetime_format(MYSQL_TIMESTAMP_DATETIME, &global_system_variables.datetime_format)) exit(1); + } diff --git a/sql/rpl_filter.h b/sql/rpl_filter.h index 5a766424d19..58d2b97c9c6 100644 --- a/sql/rpl_filter.h +++ b/sql/rpl_filter.h @@ -18,7 +18,6 @@ #define RPL_FILTER_H #include "mysql.h" -#include "my_list.h" typedef struct st_table_rule_ent { diff --git a/sql/rpl_rli.h b/sql/rpl_rli.h new file mode 100644 index 00000000000..5500fdf1f64 --- /dev/null +++ b/sql/rpl_rli.h @@ -0,0 +1,312 @@ +/* Copyright (C) 2005 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef RPL_RLI_H +#define RPL_RLI_H + +#define MAX_SLAVE_ERRMSG 1024 + +#include "rpl_tblmap.h" + +/**************************************************************************** + + Replication SQL Thread + + st_relay_log_info contains: + - the current relay log + - the current relay log offset + - master log name + - master log sequence corresponding to the last update + - misc information specific to the SQL thread + + st_relay_log_info is initialized from the slave.info file if such exists. + Otherwise, data members are intialized with defaults. The initialization is + done with init_relay_log_info() call. 
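The option handling added in mysqld.cc resolves the --binlog-format string through find_type() on a TYPELIB, rejects unknown names, and falls back to statement-based logging when no format was specified. A rough standalone equivalent of that lookup (exact-match and case-insensitive, so a simplification of the server's TYPELIB code):

#include <strings.h>   /* strcasecmp */

enum binlog_format_sketch { BF_STMT_S = 0, BF_ROW_S = 1, BF_UNSPECIFIED_S = 2 };

static const char *format_names[] = { "STATEMENT", "ROW", 0 };

/* Map a user-supplied name to the enum; an unknown name is rejected by the
   caller, and BF_UNSPECIFIED_S later falls back to statement-based logging. */
binlog_format_sketch lookup_format(const char *arg)
{
  for (int i = 0; format_names[i] != 0; i++)
    if (strcasecmp(arg, format_names[i]) == 0)
      return (binlog_format_sketch) i;
  return BF_UNSPECIFIED_S;
}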
+ + The format of slave.info file: + + relay_log_name + relay_log_pos + master_log_name + master_log_pos + + To clean up, call end_relay_log_info() + +*****************************************************************************/ + +typedef struct st_relay_log_info +{ + /* + If flag set, then rli does not store its state in any info file. + This is the case only when we execute BINLOG SQL commands inside + a client, non-replication thread. + */ + bool no_storage; + + /*** The following variables can only be read when protect by data lock ****/ + + /* + info_fd - file descriptor of the info file. set only during + initialization or clean up - safe to read anytime + cur_log_fd - file descriptor of the current read relay log + */ + File info_fd,cur_log_fd; + + /* + Protected with internal locks. + Must get data_lock when resetting the logs. + */ + MYSQL_LOG relay_log; + LOG_INFO linfo; + IO_CACHE cache_buf,*cur_log; + + /* The following variables are safe to read any time */ + + /* IO_CACHE of the info file - set only during init or end */ + IO_CACHE info_file; + + /* + When we restart slave thread we need to have access to the previously + created temporary tables. Modified only on init/end and by the SQL + thread, read only by SQL thread. + */ + TABLE *save_temporary_tables; + + /* + standard lock acquistion order to avoid deadlocks: + run_lock, data_lock, relay_log.LOCK_log, relay_log.LOCK_index + */ + pthread_mutex_t data_lock,run_lock; + + /* + start_cond is broadcast when SQL thread is started + stop_cond - when stopped + data_cond - when data protected by data_lock changes + */ + pthread_cond_t start_cond, stop_cond, data_cond; + + /* parent master info structure */ + struct st_master_info *mi; + + /* + Needed to deal properly with cur_log getting closed and re-opened with + a different log under our feet + */ + uint32 cur_log_old_open_count; + + /* + Let's call a group (of events) : + - a transaction + or + - an autocommiting query + its associated events (INSERT_ID, + TIMESTAMP...) + We need these rli coordinates : + - relay log name and position of the beginning of the group we currently are + executing. Needed to know where we have to restart when replication has + stopped in the middle of a group (which has been rolled back by the slave). + - relay log name and position just after the event we have just + executed. This event is part of the current group. + Formerly we only had the immediately above coordinates, plus a 'pending' + variable, but this dealt wrong with the case of a transaction starting on a + relay log and finishing (commiting) on another relay log. Case which can + happen when, for example, the relay log gets rotated because of + max_binlog_size. + */ + char group_relay_log_name[FN_REFLEN]; + ulonglong group_relay_log_pos; + char event_relay_log_name[FN_REFLEN]; + ulonglong event_relay_log_pos; + ulonglong future_event_relay_log_pos; + + /* + Original log name and position of the group we're currently executing + (whose coordinates are group_relay_log_name/pos in the relay log) + in the master's binlog. These concern the *group*, because in the master's + binlog the log_pos that comes with each event is the position of the + beginning of the group. + */ + char group_master_log_name[FN_REFLEN]; + volatile my_off_t group_master_log_pos; + + /* + Handling of the relay_log_space_limit optional constraint. 
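The two sets of coordinates described above can be pictured with a small conceptual sketch (not server code): the event coordinates advance after every executed event, while the group coordinates advance only once a whole group (a transaction or an autocommitted statement) has finished, so they always point at a safe restart position after a rollback.

#include <string>

struct relay_positions {
  std::string        group_relay_log_name;    /* where the current group began     */
  unsigned long long group_relay_log_pos;
  std::string        event_relay_log_name;    /* just after the last event applied */
  unsigned long long event_relay_log_pos;

  relay_positions() : group_relay_log_pos(0), event_relay_log_pos(0) {}

  void after_event(const std::string &log, unsigned long long end_pos)
  {
    event_relay_log_name = log;                /* may be a new relay log file      */
    event_relay_log_pos  = end_pos;
  }
  void after_group()                           /* commit, or end of autocommit stmt */
  {
    group_relay_log_name = event_relay_log_name;  /* restart point catches up      */
    group_relay_log_pos  = event_relay_log_pos;
  }
};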
+ ignore_log_space_limit is used to resolve a deadlock between I/O and SQL
+ threads, the SQL thread sets it to unblock the I/O thread and make it
+ temporarily forget about the constraint.
+ */
+ ulonglong log_space_limit,log_space_total;
+ bool ignore_log_space_limit;
+
+ /*
+ When it commits, InnoDB internally stores the master log position it has
+ processed so far; the position to store is the one of the end of the
+ committing event (the COMMIT query event, or the event if in autocommit
+ mode).
+ */
+#if MYSQL_VERSION_ID < 40100
+ ulonglong future_master_log_pos;
+#else
+ ulonglong future_group_master_log_pos;
+#endif
+
+ time_t last_master_timestamp;
+
+ /*
+ Needed for problems when slave stops and we want to restart it
+ skipping one or more events in the master log that have caused
+ errors, and have been manually applied by DBA already.
+ */
+ volatile uint32 slave_skip_counter;
+ volatile ulong abort_pos_wait; /* Incremented on change master */
+ volatile ulong slave_run_id; /* Incremented on slave start */
+ pthread_mutex_t log_space_lock;
+ pthread_cond_t log_space_cond;
+ THD * sql_thd;
+ int last_slave_errno;
+#ifndef DBUG_OFF
+ int events_till_abort;
+#endif
+ char last_slave_error[MAX_SLAVE_ERRMSG];
+
+ /* if not set, the values of other members of the structure are undefined */
+ bool inited;
+ volatile bool abort_slave;
+ volatile uint slave_running;
+
+ /*
+ Condition and its parameters from START SLAVE UNTIL clause.
+
+ UNTIL condition is tested with is_until_satisfied() method that is
+ called by exec_relay_log_event(). is_until_satisfied() caches the result
+ of the comparison of log names because log names don't change very often;
+ this cache is invalidated by parts of code which change log names with
+ notify_*_log_name_updated() methods. (They need to be called only if SQL
+ thread is running).
+ */
+
+ enum {UNTIL_NONE= 0, UNTIL_MASTER_POS, UNTIL_RELAY_POS} until_condition;
+ char until_log_name[FN_REFLEN];
+ ulonglong until_log_pos;
+ /* extension extracted from log_name and converted to int */
+ ulong until_log_name_extension;
+ /*
+ Cached result of comparison of until_log_name and current log name
+ -2 means uninitialised, -1,0,1 are comparison results
+ */
+ enum
+ {
+ UNTIL_LOG_NAMES_CMP_UNKNOWN= -2, UNTIL_LOG_NAMES_CMP_LESS= -1,
+ UNTIL_LOG_NAMES_CMP_EQUAL= 0, UNTIL_LOG_NAMES_CMP_GREATER= 1
+ } until_log_names_cmp_result;
+
+ char cached_charset[6];
+ /*
+ trans_retries varies between 0 and slave_transaction_retries and counts how
+ many times the slave has retried the present transaction; gets reset to 0
+ when the transaction finally succeeds. retried_trans is a cumulative
+ counter: how many times the slave has retried a transaction (any) since
+ slave started.
+ */
+ ulong trans_retries, retried_trans;
+
+ /*
+ If the end of the hot relay log is made of master's events ignored by the
+ slave I/O thread, these two keep track of the coords (in the master's
+ binlog) of the last of these events seen by the slave I/O thread. If not,
+ ign_master_log_name_end[0] == 0.
+ As they are like a Rotate event read/written from/to the relay log, they
+ are both protected by rli->relay_log.LOCK_log.
+ */
+ char ign_master_log_name_end[FN_REFLEN];
+ ulonglong ign_master_log_pos_end;
+
+ st_relay_log_info();
+ ~st_relay_log_info();
+
+ /*
+ Invalidate cached until_log_name and group_relay_log_name comparison
+ result. Should be called after any update of group_relay_log_name if
+ there is a chance that sql_thread is running.
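The cached-comparison members declared above exist because relay and binlog names such as "foo-bin.000042" change rarely; the idea, sketched below in simplified standalone code, is to compare the numeric extensions once, remember the result, and reset the cache to "unknown" from the notify_*_update() hooks whenever a name changes.

#include <cstring>
#include <cstdlib>

enum cmp_cache { CMP_UNKNOWN = -2, CMP_LESS = -1, CMP_EQUAL = 0, CMP_GREATER = 1 };

static long name_extension(const char *name)   /* "foo-bin.000042" -> 42 */
{
  const char *dot = strrchr(name, '.');
  return dot ? strtol(dot + 1, 0, 10) : 0;
}

/* Recomputed only when the cache was invalidated; callers keep the result. */
cmp_cache compare_log_names(const char *current, const char *until)
{
  long a = name_extension(current);
  long b = name_extension(until);
  return a < b ? CMP_LESS : (a > b ? CMP_GREATER : CMP_EQUAL);
}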
+ */ + inline void notify_group_relay_log_name_update() + { + if (until_condition==UNTIL_RELAY_POS) + until_log_names_cmp_result= UNTIL_LOG_NAMES_CMP_UNKNOWN; + } + + /* + The same as previous but for group_master_log_name. + */ + inline void notify_group_master_log_name_update() + { + if (until_condition==UNTIL_MASTER_POS) + until_log_names_cmp_result= UNTIL_LOG_NAMES_CMP_UNKNOWN; + } + + inline void inc_event_relay_log_pos() + { + event_relay_log_pos= future_event_relay_log_pos; + } + + void inc_group_relay_log_pos(ulonglong log_pos, + bool skip_lock=0); + + int wait_for_pos(THD* thd, String* log_name, longlong log_pos, + longlong timeout); + void close_temporary_tables(); + + /* Check if UNTIL condition is satisfied. See slave.cc for more. */ + bool is_until_satisfied(); + inline ulonglong until_pos() + { + return ((until_condition == UNTIL_MASTER_POS) ? group_master_log_pos : + group_relay_log_pos); + } + + table_mapping m_table_map; + + /* + Last charset (6 bytes) seen by slave SQL thread is cached here; it helps + the thread save 3 get_charset() per Query_log_event if the charset is not + changing from event to event (common situation). + When the 6 bytes are equal to 0 is used to mean "cache is invalidated". + */ + void cached_charset_invalidate(); + bool cached_charset_compare(char *charset); + + /* + To reload special tables when they are changes, we introduce a set + of functions that will mark whenever special functions need to be + called after modifying tables. Right now, the tables are either + ACL tables or grants tables. + */ + enum enum_reload_flag + { + RELOAD_NONE_F = 0UL, + RELOAD_GRANT_F = (1UL << 0), + RELOAD_ACCESS_F = (1UL << 1) + }; + + ulong m_reload_flags; + + void touching_table(char const* db, char const* table, ulong table_id); + void transaction_end(THD*); + + void cleanup_context(THD *, bool); + time_t unsafe_to_stop_at; +} RELAY_LOG_INFO; + +#endif /* RPL_RLI_H */ diff --git a/sql/rpl_tblmap.cc b/sql/rpl_tblmap.cc new file mode 100644 index 00000000000..a0272b23ee8 --- /dev/null +++ b/sql/rpl_tblmap.cc @@ -0,0 +1,151 @@ +/* Copyright (C) 2005 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "mysql_priv.h" + +#ifdef HAVE_REPLICATION + +#include "rpl_tblmap.h" + +#define MAYBE_TABLE_NAME(T) ((T) ? (T)->s->table_name.str : "<>") +#define TABLE_ID_HASH_SIZE 32 +#define TABLE_ID_CHUNK 256 + +table_mapping::table_mapping() + : m_free(0) +{ + /* + No "free_element" function for entries passed here, as the entries are + allocated in a MEM_ROOT (freed as a whole in the destructor), they cannot + be freed one by one. + Note that below we don't test if hash_init() succeeded. This constructor + is called at startup only. 
+ */ + (void) hash_init(&m_table_ids,&my_charset_bin,TABLE_ID_HASH_SIZE, + offsetof(entry,table_id),sizeof(ulong), + 0,0,0); + /* We don't preallocate any block, this is consistent with m_free=0 above */ + init_alloc_root(&m_mem_root, TABLE_ID_HASH_SIZE*sizeof(entry), 0); +} + +table_mapping::~table_mapping() +{ + hash_free(&m_table_ids); + free_root(&m_mem_root, MYF(0)); +} + +st_table* table_mapping::get_table(ulong table_id) +{ + DBUG_ENTER("table_mapping::get_table(ulong)"); + DBUG_PRINT("enter", ("table_id=%d", table_id)); + entry *e= find_entry(table_id); + if (e) + { + DBUG_PRINT("info", ("tid %d -> table %p (%s)", + table_id, e->table, + MAYBE_TABLE_NAME(e->table))); + DBUG_RETURN(e->table); + } + + DBUG_PRINT("info", ("tid %d is not mapped!", table_id)); + DBUG_RETURN(NULL); +} + +/* + Called when we are out of table id entries. Creates TABLE_ID_CHUNK + new entries, chain them and attach them at the head of the list of free + (free for use) entries. +*/ +int table_mapping::expand() +{ + /* + If we wanted to use "tmp= new (&m_mem_root) entry[TABLE_ID_CHUNK]", + we would have to make "entry" derive from Sql_alloc but then it would not + be a POD anymore and we want it to be (see rpl_tblmap.h). So we allocate + in C. + */ + entry *tmp= (entry *)alloc_root(&m_mem_root, TABLE_ID_CHUNK*sizeof(entry)); + if (tmp == NULL) + return ERR_MEMORY_ALLOCATION; // Memory allocation failed + + /* Find the end of this fresh new array of free entries */ + entry *e_end= tmp+TABLE_ID_CHUNK-1; + for (entry *e= tmp; e < e_end; e++) + e->next= e+1; + e_end->next= m_free; + m_free= tmp; + return 0; +} + +int table_mapping::set_table(ulong table_id, TABLE* table) +{ + DBUG_ENTER("table_mapping::set_table(ulong,TABLE*)"); + DBUG_PRINT("enter", ("table_id=%d, table=%p (%s)", + table_id, + table, MAYBE_TABLE_NAME(table))); + entry *e= find_entry(table_id); + if (e == 0) + { + if (m_free == 0 && expand()) + DBUG_RETURN(ERR_MEMORY_ALLOCATION); // Memory allocation failed + e= m_free; + m_free= m_free->next; + } + else + hash_delete(&m_table_ids,(byte *)e); + + e->table_id= table_id; + e->table= table; + my_hash_insert(&m_table_ids,(byte *)e); + + DBUG_PRINT("info", ("tid %d -> table %p (%s)", + table_id, e->table, + MAYBE_TABLE_NAME(e->table))); + DBUG_RETURN(0); // All OK +} + +int table_mapping::remove_table(ulong table_id) +{ + entry *e= find_entry(table_id); + if (e) + { + hash_delete(&m_table_ids,(byte *)e); + /* we add this entry to the chain of free (free for use) entries */ + e->next= m_free; + m_free= e; + return 0; // All OK + } + return 1; // No table to remove +} + +/* + Puts all entries into the list of free-for-use entries (does not free any + memory), and empties the hash. +*/ +void table_mapping::clear_tables() +{ + DBUG_ENTER("table_mapping::clear_tables()"); + for (uint i= 0; i < m_table_ids.records; i++) + { + entry *e= (entry *)hash_element(&m_table_ids, i); + e->next= m_free; + m_free= e; + } + my_hash_reset(&m_table_ids); + DBUG_VOID_RETURN; +} + +#endif diff --git a/sql/rpl_tblmap.h b/sql/rpl_tblmap.h new file mode 100644 index 00000000000..cfc2d7c2c6c --- /dev/null +++ b/sql/rpl_tblmap.h @@ -0,0 +1,109 @@ +/* Copyright (C) 2005 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef TABLE_MAPPING_H +#define TABLE_MAPPING_H + +/* Forward declarations */ +struct st_table; +typedef st_table TABLE; + +/* + CLASS table_mapping + + RESPONSIBILITIES + The table mapping is used to map table id's to table pointers + + COLLABORATION + RELAY_LOG For mapping table id:s to tables when receiving events. + */ + +/* + Guilhem to Mats: + in the table_mapping class, the memory is allocated and never freed (until + destruction). So this is a good candidate for allocating inside a MEM_ROOT: + it gives the efficient allocation in chunks (like in expand()). So I have + introduced a MEM_ROOT. + + Note that inheriting from Sql_alloc had no effect: it has effects only when + "ptr= new table_mapping" is called, and this is never called. And it would + then allocate from thd->mem_root which is a highly volatile object (reset + from example after executing each query, see dispatch_command(), it has a + free_root() at end); as the table_mapping object is supposed to live longer + than a query, it was dangerous. + A dedicated MEM_ROOT needs to be used, see below. +*/ + +class table_mapping { + +private: + MEM_ROOT m_mem_root; + +public: + + enum { + NO_TABLE = ULONG_MAX + }; + + enum enum_error { + ERR_NO_ERROR = 0, + ERR_LIMIT_EXCEEDED, + ERR_MEMORY_ALLOCATION + }; + + table_mapping(); + ~table_mapping(); + + TABLE* get_table(ulong table_id); + + int set_table(ulong table_id, TABLE* table); + int remove_table(ulong table_id); + void clear_tables(); + ulong count() const { return m_table_ids.records; } + +private: + /* + This is a POD (Plain Old Data). Keep it that way (we apply offsetof() to + it, which only works for PODs) + */ + struct entry { + ulong table_id; + union { + TABLE *table; + entry *next; + }; + }; + + entry *find_entry(ulong table_id) + { + return (entry *)hash_search(&m_table_ids, + (byte*)&table_id, + sizeof(table_id)); + } + int expand(); + + /* + Head of the list of free entries; "free" in the sense that it's an + allocated entry free for use, NOT in the sense that it's freed + memory. 
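Putting the pieces of rpl_tblmap.h and rpl_tblmap.cc together, the allocation pattern is: POD entries are carved out of chunks that are only ever freed as a whole (the MEM_ROOT), recycled through the m_free list, and looked up by table id through a hash. The sketch below shows the same shape with a standard container standing in for MEM_ROOT and HASH; it is illustration only, and the chunks are deliberately never freed individually, as in the original.

#include <cstdlib>
#include <map>

struct entry {                       /* kept a POD, as the comment above requires */
  unsigned long table_id;
  union { void *table; entry *next; };
};

class table_mapping_sketch {
public:
  table_mapping_sketch() : m_free(0) {}

  int set_table(unsigned long id, void *table)
  {
    entry *e;
    std::map<unsigned long, entry*>::iterator it = m_ids.find(id);
    if (it != m_ids.end())
      e = it->second;                          /* re-map an existing id         */
    else {
      if (m_free == 0 && expand()) return 1;   /* out of memory                 */
      e = m_free; m_free = m_free->next;       /* pop a recycled entry          */
    }
    e->table_id = id; e->table = table; m_ids[id] = e;
    return 0;
  }

  int remove_table(unsigned long id)
  {
    std::map<unsigned long, entry*>::iterator it = m_ids.find(id);
    if (it == m_ids.end()) return 1;
    it->second->next = m_free; m_free = it->second;   /* back on the free list  */
    m_ids.erase(it);
    return 0;
  }

private:
  enum { CHUNK = 256 };

  int expand()                                 /* chain CHUNK fresh free entries */
  {
    entry *block = (entry *) malloc(CHUNK * sizeof(entry));
    if (block == 0) return 1;
    for (int i = 0; i < CHUNK - 1; i++) block[i].next = &block[i + 1];
    block[CHUNK - 1].next = m_free;
    m_free = block;                            /* blocks are never freed singly  */
    return 0;
  }

  entry *m_free;                               /* head of the free-for-use list  */
  std::map<unsigned long, entry*> m_ids;       /* stands in for the HASH         */
};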
+ */ + entry *m_free; + + /* Correspondance between an id (a number) and a TABLE object */ + HASH m_table_ids; +}; + +#endif diff --git a/sql/set_var.cc b/sql/set_var.cc index 990d8047009..01ff30045c4 100644 --- a/sql/set_var.cc +++ b/sql/set_var.cc @@ -619,6 +619,7 @@ sys_var_have_variable sys_have_query_cache("have_query_cache", sys_var_have_variable sys_have_raid("have_raid", &have_raid); sys_var_have_variable sys_have_rtree_keys("have_rtree_keys", &have_rtree_keys); sys_var_have_variable sys_have_symlink("have_symlink", &have_symlink); +sys_var_have_variable sys_have_row_based_replication("have_row_based_replication",&have_row_based_replication); /* Global read-only variable describing server license */ sys_var_const_str sys_license("license", STRINGIFY_ARG(LICENSE)); @@ -643,6 +644,7 @@ struct show_var_st init_vars[]= { {"bdb_shared_data", (char*) &berkeley_shared_data, SHOW_BOOL}, {"bdb_tmpdir", (char*) &berkeley_tmpdir, SHOW_CHAR_PTR}, {sys_binlog_cache_size.name,(char*) &sys_binlog_cache_size, SHOW_SYS}, + {"binlog_format", (char*) &opt_binlog_format, SHOW_CHAR_PTR}, {sys_bulk_insert_buff_size.name,(char*) &sys_bulk_insert_buff_size,SHOW_SYS}, {sys_character_set_client.name,(char*) &sys_character_set_client, SHOW_SYS}, {sys_character_set_connection.name,(char*) &sys_character_set_connection,SHOW_SYS}, @@ -695,6 +697,7 @@ struct show_var_st init_vars[]= { {sys_have_raid.name, (char*) &have_raid, SHOW_HAVE}, {sys_have_rtree_keys.name, (char*) &have_rtree_keys, SHOW_HAVE}, {sys_have_symlink.name, (char*) &have_symlink, SHOW_HAVE}, + {sys_have_row_based_replication.name, (char*) &have_row_based_replication, SHOW_HAVE}, {"init_connect", (char*) &sys_init_connect, SHOW_SYS}, {"init_file", (char*) &opt_init_file, SHOW_CHAR_PTR}, {"init_slave", (char*) &sys_init_slave, SHOW_SYS}, diff --git a/sql/share/errmsg.txt b/sql/share/errmsg.txt index aa5607c9f6a..ab9663db475 100644 --- a/sql/share/errmsg.txt +++ b/sql/share/errmsg.txt @@ -5727,3 +5727,9 @@ ER_WRONG_VALUE eng "Incorrect %-.32s value: '%-.128s'" ER_NO_PARTITION_FOR_GIVEN_VALUE eng "Table has no partition for value %ld" +ER_BINLOG_ROW_LOGGING_FAILED + eng "Writing one row to the row-based binary log failed" +ER_BINLOG_ROW_WRONG_TABLE_DEF + eng "Table definition on master and slave does not match" +ER_BINLOG_ROW_RBR_TO_SBR + eng "Slave running with --log-slave-updates must use row-based binary logging to be able to replicate row-based binary log events" diff --git a/sql/slave.cc b/sql/slave.cc index 6c9cfc250c5..99bddb7b9b0 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -16,10 +16,9 @@ #include "mysql_priv.h" -#ifdef HAVE_REPLICATION - #include <mysql.h> #include <myisam.h> +#include "rpl_rli.h" #include "slave.h" #include "sql_repl.h" #include "rpl_filter.h" @@ -28,6 +27,10 @@ #include <my_dir.h> #include <sql_common.h> +#ifdef HAVE_REPLICATION + +#include "rpl_tblmap.h" + #define MAX_SLAVE_RETRY_PAUSE 5 bool use_slave_mask = 0; MY_BITMAP slave_error_mask; @@ -48,8 +51,6 @@ ulonglong relay_log_space_limit = 0; */ int disconnect_slave_event_count = 0, abort_slave_event_count = 0; -int events_till_abort = -1; -static int events_till_disconnect = -1; typedef enum { SLAVE_THD_IO, SLAVE_THD_SQL} SLAVE_THD_TYPE; @@ -860,19 +861,48 @@ static bool sql_slave_killed(THD* thd, RELAY_LOG_INFO* rli) { DBUG_ASSERT(rli->sql_thd == thd); DBUG_ASSERT(rli->slave_running == 1);// tracking buffer overrun - return rli->abort_slave || abort_loop || thd->killed; + if (abort_loop || thd->killed || rli->abort_slave) + { + /* + If we are in an unsafe 
situation (stopping could corrupt replication), + we give one minute to the slave SQL thread of grace before really + terminating, in the hope that it will be able to read more events and + the unsafe situation will soon be left. Note that this one minute starts + from the last time anything happened in the slave SQL thread. So it's + really one minute of idleness, we don't timeout if the slave SQL thread + is actively working. + */ + if (!rli->unsafe_to_stop_at) + return 1; + DBUG_PRINT("info", ("Slave SQL thread is in an unsafe situation, giving " + "it some grace period")); + if (difftime(time(0), rli->unsafe_to_stop_at) > 60) + { + slave_print_msg(ERROR_LEVEL, rli, 0, + "SQL thread had to stop in an unsafe situation, in " + "the middle of applying updates to a " + "non-transactional table without any primary key. " + "There is a risk of duplicate updates when the slave " + "SQL thread is restarted. Please check your tables' " + "contents after restart."); + return 1; + } + } + return 0; } /* - Writes an error message to rli->last_slave_error and rli->last_slave_errno - (which will be displayed by SHOW SLAVE STATUS), and prints it to stderr. + Writes a message to stderr, and if it's an error message, to + rli->last_slave_error and rli->last_slave_errno (which will be displayed by + SHOW SLAVE STATUS). SYNOPSIS - slave_print_error() - rli + slave_print_msg() + level The severity level + rli err_code The error code - msg The error message (usually related to the error code, but can + msg The message (usually related to the error code, but can contain more information). ... (this is printf-like format, with % symbols in msg) @@ -880,22 +910,47 @@ static bool sql_slave_killed(THD* thd, RELAY_LOG_INFO* rli) void */ -void slave_print_error(RELAY_LOG_INFO* rli, int err_code, const char* msg, ...) +void slave_print_msg(enum loglevel level, RELAY_LOG_INFO* rli, + int err_code, const char* msg, ...) { + void (*report_function)(const char *, ...); + char buff[MAX_SLAVE_ERRMSG], *pbuff= buff; + uint pbuffsize= sizeof(buff); va_list args; va_start(args,msg); - my_vsnprintf(rli->last_slave_error, - sizeof(rli->last_slave_error), msg, args); - rli->last_slave_errno = err_code; - /* If the error string ends with '.', do not add a ',' it would be ugly */ - if (rli->last_slave_error[0] && - (*(strend(rli->last_slave_error)-1) == '.')) - sql_print_error("Slave: %s Error_code: %d", rli->last_slave_error, - err_code); + switch (level) + { + case ERROR_LEVEL: + /* + This my_error call only has effect in client threads. + Slave threads do nothing in my_error(). + */ + my_error(ER_UNKNOWN_ERROR, MYF(0), msg); + /* + It's an error, it must be reported in Last_error and Last_errno in SHOW + SLAVE STATUS. 
+ */ + pbuff= rli->last_slave_error; + pbuffsize= sizeof(rli->last_slave_error); + rli->last_slave_errno = err_code; + report_function= sql_print_error; + break; + case WARNING_LEVEL: + report_function= sql_print_warning; + break; + case INFORMATION_LEVEL: + report_function= sql_print_information; + break; + default: + DBUG_ASSERT(0); // should not come here + return; // don't crash production builds, just do nothing + } + my_vsnprintf(pbuff, pbuffsize, msg, args); + /* If the msg string ends with '.', do not add a ',' it would be ugly */ + if (pbuff[0] && (*(strend(pbuff)-1) == '.')) + (*report_function)("Slave: %s Error_code: %d", pbuff, err_code); else - sql_print_error("Slave: %s, Error_code: %d", rli->last_slave_error, - err_code); - + (*report_function)("Slave: %s, Error_code: %d", pbuff, err_code); } /* @@ -919,7 +974,6 @@ bool net_request_file(NET* net, const char* fname) DBUG_RETURN(net_write_command(net, 251, fname, strlen(fname), "", 0)); } - /* From other comments and tests in code, it looks like sometimes Query_log_event and Load_log_event can have db == 0 @@ -932,7 +986,6 @@ const char *print_slave_db_safe(const char* db) return (db ? db : ""); } - static int init_strvar_from_file(char *var, int max_size, IO_CACHE *f, const char *default_val) { @@ -1379,6 +1432,7 @@ static int init_relay_log_info(RELAY_LOG_INFO* rli, const char* msg = 0; int error = 0; DBUG_ENTER("init_relay_log_info"); + DBUG_ASSERT(!rli->no_storage); // Don't init if there is no storage if (rli->inited) // Set if this function called DBUG_RETURN(0); @@ -1674,7 +1728,7 @@ static void write_ignored_events_info_to_relay_log(THD *thd, MASTER_INFO *mi) if (rli->ign_master_log_name_end[0]) { DBUG_PRINT("info",("writing a Rotate event to track down ignored events")); - Rotate_log_event *ev= new Rotate_log_event(thd, rli->ign_master_log_name_end, + Rotate_log_event *ev= new Rotate_log_event(rli->ign_master_log_name_end, 0, rli->ign_master_log_pos_end, Rotate_log_event::DUP_NAME); rli->ign_master_log_name_end[0]= 0; @@ -2241,17 +2295,17 @@ bool flush_master_info(MASTER_INFO* mi, bool flush_relay_log_cache) st_relay_log_info::st_relay_log_info() - :info_fd(-1), cur_log_fd(-1), save_temporary_tables(0), + :no_storage(FALSE), info_fd(-1), cur_log_fd(-1), save_temporary_tables(0), cur_log_old_open_count(0), group_master_log_pos(0), log_space_total(0), ignore_log_space_limit(0), last_master_timestamp(0), slave_skip_counter(0), abort_pos_wait(0), slave_run_id(0), sql_thd(0), last_slave_errno(0), inited(0), abort_slave(0), slave_running(0), until_condition(UNTIL_NONE), - until_log_pos(0), retried_trans(0) + until_log_pos(0), retried_trans(0), m_reload_flags(RELOAD_NONE_F), + unsafe_to_stop_at(0) { group_relay_log_name[0]= event_relay_log_name[0]= group_master_log_name[0]= 0; last_slave_error[0]= until_log_name[0]= ign_master_log_name_end[0]= 0; - bzero((char*) &info_file, sizeof(info_file)); bzero((char*) &cache_buf, sizeof(cache_buf)); cached_charset_invalidate(); @@ -2671,11 +2725,9 @@ static ulong read_event(MYSQL* mysql, MASTER_INFO *mi, bool* suppress_warnings) /* my_real_read() will time us out We check if we were told to die, and if not, try reading again - - TODO: Move 'events_till_disconnect' to the MASTER_INFO structure */ #ifndef DBUG_OFF - if (disconnect_slave_event_count && !(events_till_disconnect--)) + if (disconnect_slave_event_count && !(mi->events_till_disconnect--)) return packet_error; #endif @@ -2950,7 +3002,7 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli) thd->lex->current_select= 
0; if (!ev->when) ev->when = time(NULL); - ev->thd = thd; + ev->thd = thd; // because up to this point, ev->thd == 0 exec_res = ev->exec_event(rli); DBUG_ASSERT(rli->sql_thd==thd); /* @@ -3022,7 +3074,7 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli) else { pthread_mutex_unlock(&rli->data_lock); - slave_print_error(rli, 0, "\ + slave_print_msg(ERROR_LEVEL, rli, 0, "\ Could not parse relay log event entry. The possible reasons are: the master's \ binary log is corrupted (you can check this by running 'mysqlbinlog' on the \ binary log), the slave's relay log is corrupted (you can check this by running \ @@ -3051,9 +3103,6 @@ pthread_handler_t handle_slave_io(void *arg) my_thread_init(); DBUG_ENTER("handle_slave_io"); -#ifndef DBUG_OFF -slave_begin: -#endif DBUG_ASSERT(mi->inited); mysql= NULL ; retry_count= 0; @@ -3063,7 +3112,7 @@ slave_begin: mi->slave_run_id++; #ifndef DBUG_OFF - mi->events_till_abort = abort_slave_event_count; + mi->events_till_disconnect = disconnect_slave_event_count; #endif thd= new THD; // note that contructor of THD uses DBUG_ ! @@ -3301,14 +3350,6 @@ ignore_log_space_limit=%d", log space"); goto err; } - // TODO: check debugging abort code -#ifndef DBUG_OFF - if (abort_slave_event_count && !--events_till_abort) - { - sql_print_error("Slave I/O thread: debugging abort"); - goto err; - } -#endif } } @@ -3347,10 +3388,6 @@ err: pthread_mutex_unlock(&LOCK_thread_count); pthread_cond_broadcast(&mi->stop_cond); // tell the world we are done pthread_mutex_unlock(&mi->run_lock); -#ifndef DBUG_OFF - if (abort_slave_event_count && !events_till_abort) - goto slave_begin; -#endif my_thread_end(); pthread_exit(0); DBUG_RETURN(0); // Can't return anything here @@ -3370,10 +3407,6 @@ pthread_handler_t handle_slave_sql(void *arg) my_thread_init(); DBUG_ENTER("handle_slave_sql"); -#ifndef DBUG_OFF -slave_begin: -#endif - DBUG_ASSERT(rli->inited); pthread_mutex_lock(&rli->run_lock); DBUG_ASSERT(!rli->slave_running); @@ -3520,6 +3553,14 @@ the slave SQL thread with \"SLAVE START\". We stopped at log \ RPL_LOG_NAME, llstr(rli->group_master_log_pos,llbuff)); err: + + /* + Some events set some playgrounds, which won't be cleared because thread + stops. Stopping of this thread may not be known to these events ("stop" + request is detected only by the present function, not by events), so we + must "proactively" clear playgrounds: + */ + rli->cleanup_context(thd, 1); VOID(pthread_mutex_lock(&LOCK_thread_count)); /* Some extra safety, which should not been needed (normally, event deletion @@ -3565,10 +3606,6 @@ the slave SQL thread with \"SLAVE START\". We stopped at log \ pthread_cond_broadcast(&rli->stop_cond); // tell the world we are done pthread_mutex_unlock(&rli->run_lock); -#ifndef DBUG_OFF // TODO: reconsider the code below - if (abort_slave_event_count && !rli->events_till_abort) - goto slave_begin; -#endif my_thread_end(); pthread_exit(0); DBUG_RETURN(0); // Can't return anything here @@ -3721,7 +3758,7 @@ static int process_io_rotate(MASTER_INFO *mi, Rotate_log_event *rev) rotate event forever, so we need to not disconnect after one. 
*/ if (disconnect_slave_event_count) - events_till_disconnect++; + mi->events_till_disconnect++; #endif /* @@ -4177,7 +4214,7 @@ static int connect_to_master(THD* thd, MYSQL* mysql, MASTER_INFO* mi, DBUG_ENTER("connect_to_master"); #ifndef DBUG_OFF - events_till_disconnect = disconnect_slave_event_count; + mi->events_till_disconnect = disconnect_slave_event_count; #endif ulong client_flag= CLIENT_REMEMBER_OPTIONS; if (opt_slave_compressed_protocol) @@ -4311,6 +4348,10 @@ static int safe_reconnect(THD* thd, MYSQL* mysql, MASTER_INFO* mi, bool flush_relay_log_info(RELAY_LOG_INFO* rli) { bool error=0; + + if (unlikely(rli->no_storage)) + return 0; + IO_CACHE *file = &rli->info_file; char buff[FN_REFLEN*2+22*2+4], *pos; @@ -4327,6 +4368,7 @@ bool flush_relay_log_info(RELAY_LOG_INFO* rli) error=1; if (flush_io_cache(file)) error=1; + /* Flushing the relay log is done by the slave I/O thread */ return error; } @@ -4357,7 +4399,7 @@ static IO_CACHE *reopen_relay_log(RELAY_LOG_INFO *rli, const char **errmsg) } -Log_event* next_event(RELAY_LOG_INFO* rli) +static Log_event* next_event(RELAY_LOG_INFO* rli) { Log_event* ev; IO_CACHE* cur_log = rli->cur_log; @@ -4368,6 +4410,11 @@ Log_event* next_event(RELAY_LOG_INFO* rli) DBUG_ENTER("next_event"); DBUG_ASSERT(thd != 0); +#ifndef DBUG_OFF + if (abort_slave_event_count && !rli->events_till_abort--) + DBUG_RETURN(0); +#endif + /* For most operations we need to protect rli members with data_lock, so we assume calling function acquired this mutex for us and we will @@ -4489,7 +4536,7 @@ Log_event* next_event(RELAY_LOG_INFO* rli) { /* We generate and return a Rotate, to make our positions advance */ DBUG_PRINT("info",("seeing an ignored end segment")); - ev= new Rotate_log_event(thd, rli->ign_master_log_name_end, + ev= new Rotate_log_event(rli->ign_master_log_name_end, 0, rli->ign_master_log_pos_end, Rotate_log_event::DUP_NAME); rli->ign_master_log_name_end[0]= 0; @@ -4737,11 +4784,114 @@ end: DBUG_VOID_RETURN; } +/* + Some system tables needed to be re-read by the MySQL server after it has + updated them; in statement-based replication, the GRANT and other commands + are sent verbatim to the slave which then reloads; in row-based replication, + changes to these tables are done through ordinary Rows binlog events, so + master must add some flag for the slave to know it has to reload the tables. +*/ +struct st_reload_entry +{ + char const *table; + st_relay_log_info::enum_reload_flag flag; +}; + +/* + Sorted array of table names, please keep it sorted since we are + using bsearch() on it below. 
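
   A standalone sketch of the same lookup pattern, assuming placeholder flag
   values rather than the RELOAD_* constants used below:

     #include <cstdlib>
     #include <cstring>
     #include <cstdio>

     struct entry { const char *table; unsigned flag; };

     static const entry tables[]= {          /* must stay sorted by name */
       { "columns_priv", 1 }, { "db", 2 }, { "host", 2 },
       { "procs_priv", 1 }, { "tables_priv", 1 }, { "user", 2 }
     };

     /* bsearch() passes the key as the first argument, an element as the second */
     static int cmp(const void *key, const void *elem)
     {
       return strcmp(static_cast<const char*>(key),
                     static_cast<const entry*>(elem)->table);
     }

     int main()
     {
       const void *hit= bsearch("db", tables, sizeof(tables)/sizeof(*tables),
                                sizeof(*tables), cmp);
       printf("%u\n", hit ? static_cast<const entry*>(hit)->flag : 0u); /* 2 */
       return 0;
     }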
+ */ +static st_reload_entry s_mysql_tables[] = +{ + { "columns_priv", st_relay_log_info::RELOAD_GRANT_F }, + { "db", st_relay_log_info::RELOAD_ACCESS_F }, + { "host", st_relay_log_info::RELOAD_ACCESS_F }, + { "procs_priv", st_relay_log_info::RELOAD_GRANT_F }, + { "tables_priv", st_relay_log_info::RELOAD_GRANT_F }, + { "user", st_relay_log_info::RELOAD_ACCESS_F } +}; + +static const my_size_t s_mysql_tables_size = + sizeof(s_mysql_tables)/sizeof(*s_mysql_tables); + +static int reload_entry_compare(const void *lhs, const void *rhs) +{ + const char *lstr = static_cast<const char *>(lhs); + const char *rstr = static_cast<const st_reload_entry*>(rhs)->table; + return strcmp(lstr, rstr); +} + +void st_relay_log_info::touching_table(char const* db, char const* table, + ulong table_id) +{ + if (strcmp(db,"mysql") == 0) + { +#if defined(HAVE_BSEARCH) && defined(HAVE_SIZE_T) + void *const ptr= bsearch(table, s_mysql_tables, + s_mysql_tables_size, + sizeof(*s_mysql_tables), reload_entry_compare); + st_reload_entry const *const entry= static_cast<st_reload_entry*>(ptr); +#else + /* + Fall back to full scan, there are few rows anyway and updating the + "mysql" database is rare. + */ + st_reload_entry const *entry= s_mysql_tables; + for ( ; entry < s_mysql_tables + s_mysql_tables_size ; entry++) + if (reload_entry_compare(table, entry) == 0) + break; +#endif + if (entry) + m_reload_flags|= entry->flag; + } +} + +void st_relay_log_info::transaction_end(THD* thd) +{ + if (m_reload_flags != RELOAD_NONE_F) + { + if (m_reload_flags & RELOAD_ACCESS_F) + acl_reload(thd); + + if (m_reload_flags & RELOAD_GRANT_F) + grant_reload(thd); + + m_reload_flags= RELOAD_NONE_F; + } +} + +#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) +void st_relay_log_info::cleanup_context(THD *thd, bool error) +{ + DBUG_ASSERT(sql_thd == thd); + /* + 1) Instances of Table_map_log_event, if ::exec_event() was called on them, + may have opened tables, which we cannot be sure have been closed (because + maybe the Rows_log_event have not been found or will not be, because slave + SQL thread is stopping, or relay log has a missing tail etc). So we close + all thread's tables. And so the table mappings have to be cancelled. + 2) Rows_log_event::exec_event() may even have started statements or + transactions on them, which we need to rollback in case of error. + 3) If finding a Format_description_log_event after a BEGIN, we also need + to rollback before continuing with the next events. + 4) so we need this "context cleanup" function. 
+ */ + if (error) + { + ha_autocommit_or_rollback(thd, 1); // if a "statement transaction" + end_trans(thd, ROLLBACK); // if a "real transaction" + } + m_table_map.clear_tables(); + close_thread_tables(thd); + unsafe_to_stop_at= 0; +} +#endif + #ifdef HAVE_EXPLICIT_TEMPLATE_INSTANTIATION template class I_List_iterator<i_string>; template class I_List_iterator<i_string_pair>; #endif - #endif /* HAVE_REPLICATION */ + diff --git a/sql/slave.h b/sql/slave.h index c994bfb2d34..6870aaca752 100644 --- a/sql/slave.h +++ b/sql/slave.h @@ -14,17 +14,19 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#ifdef HAVE_REPLICATION - #ifndef SLAVE_H #define SLAVE_H -#include "mysql.h" +#ifdef HAVE_REPLICATION + +#include "log.h" #include "my_list.h" #include "rpl_filter.h" +#include "rpl_tblmap.h" +#include "rpl_rli.h" #define SLAVE_NET_TIMEOUT 3600 -#define MAX_SLAVE_ERRMSG 1024 + #define MAX_SLAVE_ERROR 2000 /***************************************************************************** @@ -110,265 +112,7 @@ struct st_master_info; #define MYSQL_SLAVE_RUN_NOT_CONNECT 1 #define MYSQL_SLAVE_RUN_CONNECT 2 -/**************************************************************************** - - Replication SQL Thread - - st_relay_log_info contains: - - the current relay log - - the current relay log offset - - master log name - - master log sequence corresponding to the last update - - misc information specific to the SQL thread - - st_relay_log_info is initialized from the slave.info file if such exists. - Otherwise, data members are intialized with defaults. The initialization is - done with init_relay_log_info() call. - - The format of slave.info file: - - relay_log_name - relay_log_pos - master_log_name - master_log_pos - - To clean up, call end_relay_log_info() - -*****************************************************************************/ - -typedef struct st_relay_log_info -{ - /*** The following variables can only be read when protect by data lock ****/ - - /* - info_fd - file descriptor of the info file. set only during - initialization or clean up - safe to read anytime - cur_log_fd - file descriptor of the current read relay log - */ - File info_fd,cur_log_fd; - - /* - Protected with internal locks. - Must get data_lock when resetting the logs. - */ - MYSQL_LOG relay_log; - LOG_INFO linfo; - IO_CACHE cache_buf,*cur_log; - - /* The following variables are safe to read any time */ - - /* IO_CACHE of the info file - set only during init or end */ - IO_CACHE info_file; - - /* - When we restart slave thread we need to have access to the previously - created temporary tables. Modified only on init/end and by the SQL - thread, read only by SQL thread. - */ - TABLE *save_temporary_tables; - - /* - standard lock acquistion order to avoid deadlocks: - run_lock, data_lock, relay_log.LOCK_log, relay_log.LOCK_index - */ - pthread_mutex_t data_lock,run_lock; - - /* - start_cond is broadcast when SQL thread is started - stop_cond - when stopped - data_cond - when data protected by data_lock changes - */ - pthread_cond_t start_cond, stop_cond, data_cond; - - /* parent master info structure */ - struct st_master_info *mi; - - /* - Needed to deal properly with cur_log getting closed and re-opened with - a different log under our feet - */ - uint32 cur_log_old_open_count; - - /* - Let's call a group (of events) : - - a transaction - or - - an autocommiting query + its associated events (INSERT_ID, - TIMESTAMP...) 
- We need these rli coordinates : - - relay log name and position of the beginning of the group we currently are - executing. Needed to know where we have to restart when replication has - stopped in the middle of a group (which has been rolled back by the slave). - - relay log name and position just after the event we have just - executed. This event is part of the current group. - Formerly we only had the immediately above coordinates, plus a 'pending' - variable, but this dealt wrong with the case of a transaction starting on a - relay log and finishing (commiting) on another relay log. Case which can - happen when, for example, the relay log gets rotated because of - max_binlog_size. - */ - char group_relay_log_name[FN_REFLEN]; - ulonglong group_relay_log_pos; - char event_relay_log_name[FN_REFLEN]; - ulonglong event_relay_log_pos; - ulonglong future_event_relay_log_pos; - - /* - Original log name and position of the group we're currently executing - (whose coordinates are group_relay_log_name/pos in the relay log) - in the master's binlog. These concern the *group*, because in the master's - binlog the log_pos that comes with each event is the position of the - beginning of the group. - */ - char group_master_log_name[FN_REFLEN]; - volatile my_off_t group_master_log_pos; - - /* - Handling of the relay_log_space_limit optional constraint. - ignore_log_space_limit is used to resolve a deadlock between I/O and SQL - threads, the SQL thread sets it to unblock the I/O thread and make it - temporarily forget about the constraint. - */ - ulonglong log_space_limit,log_space_total; - bool ignore_log_space_limit; - - /* - When it commits, InnoDB internally stores the master log position it has - processed so far; the position to store is the one of the end of the - committing event (the COMMIT query event, or the event if in autocommit - mode). - */ -#if MYSQL_VERSION_ID < 40100 - ulonglong future_master_log_pos; -#else - ulonglong future_group_master_log_pos; -#endif - - time_t last_master_timestamp; - - /* - Needed for problems when slave stops and we want to restart it - skipping one or more events in the master log that have caused - errors, and have been manually applied by DBA already. - */ - volatile uint32 slave_skip_counter; - volatile ulong abort_pos_wait; /* Incremented on change master */ - volatile ulong slave_run_id; /* Incremented on slave start */ - pthread_mutex_t log_space_lock; - pthread_cond_t log_space_cond; - THD * sql_thd; - int last_slave_errno; -#ifndef DBUG_OFF - int events_till_abort; -#endif - char last_slave_error[MAX_SLAVE_ERRMSG]; - - /* if not set, the value of other members of the structure are undefined */ - bool inited; - volatile bool abort_slave; - volatile uint slave_running; - - /* - Condition and its parameters from START SLAVE UNTIL clause. - - UNTIL condition is tested with is_until_satisfied() method that is - called by exec_relay_log_event(). is_until_satisfied() caches the result - of the comparison of log names because log names don't change very often; - this cache is invalidated by parts of code which change log names with - notify_*_log_name_updated() methods. (They need to be called only if SQL - thread is running). 
- */ - - enum {UNTIL_NONE= 0, UNTIL_MASTER_POS, UNTIL_RELAY_POS} until_condition; - char until_log_name[FN_REFLEN]; - ulonglong until_log_pos; - /* extension extracted from log_name and converted to int */ - ulong until_log_name_extension; - /* - Cached result of comparison of until_log_name and current log name - -2 means unitialised, -1,0,1 are comarison results - */ - enum - { - UNTIL_LOG_NAMES_CMP_UNKNOWN= -2, UNTIL_LOG_NAMES_CMP_LESS= -1, - UNTIL_LOG_NAMES_CMP_EQUAL= 0, UNTIL_LOG_NAMES_CMP_GREATER= 1 - } until_log_names_cmp_result; - - char cached_charset[6]; - /* - trans_retries varies between 0 to slave_transaction_retries and counts how - many times the slave has retried the present transaction; gets reset to 0 - when the transaction finally succeeds. retried_trans is a cumulative - counter: how many times the slave has retried a transaction (any) since - slave started. - */ - ulong trans_retries, retried_trans; - - /* - If the end of the hot relay log is made of master's events ignored by the - slave I/O thread, these two keep track of the coords (in the master's - binlog) of the last of these events seen by the slave I/O thread. If not, - ign_master_log_name_end[0] == 0. - As they are like a Rotate event read/written from/to the relay log, they - are both protected by rli->relay_log.LOCK_log. - */ - char ign_master_log_name_end[FN_REFLEN]; - ulonglong ign_master_log_pos_end; - - st_relay_log_info(); - ~st_relay_log_info(); - - /* - Invalidate cached until_log_name and group_relay_log_name comparison - result. Should be called after any update of group_realy_log_name if - there chances that sql_thread is running. - */ - inline void notify_group_relay_log_name_update() - { - if (until_condition==UNTIL_RELAY_POS) - until_log_names_cmp_result= UNTIL_LOG_NAMES_CMP_UNKNOWN; - } - - /* - The same as previous but for group_master_log_name. - */ - inline void notify_group_master_log_name_update() - { - if (until_condition==UNTIL_MASTER_POS) - until_log_names_cmp_result= UNTIL_LOG_NAMES_CMP_UNKNOWN; - } - - inline void inc_event_relay_log_pos() - { - event_relay_log_pos= future_event_relay_log_pos; - } - - void inc_group_relay_log_pos(ulonglong log_pos, - bool skip_lock=0); - - int wait_for_pos(THD* thd, String* log_name, longlong log_pos, - longlong timeout); - void close_temporary_tables(); - - /* Check if UNTIL condition is satisfied. See slave.cc for more. */ - bool is_until_satisfied(); - inline ulonglong until_pos() - { - return ((until_condition == UNTIL_MASTER_POS) ? group_master_log_pos : - group_relay_log_pos); - } - /* - Last charset (6 bytes) seen by slave SQL thread is cached here; it helps - the thread save 3 get_charset() per Query_log_event if the charset is not - changing from event to event (common situation). - When the 6 bytes are equal to 0 is used to mean "cache is invalidated". - */ - void cached_charset_invalidate(); - bool cached_charset_compare(char *charset); -} RELAY_LOG_INFO; - - -Log_event* next_event(RELAY_LOG_INFO* rli); +static Log_event* next_event(RELAY_LOG_INFO* rli); /***************************************************************************** @@ -427,7 +171,7 @@ typedef struct st_master_info uint port; uint connect_retry; #ifndef DBUG_OFF - int events_till_abort; + int events_till_disconnect; #endif bool inited; volatile bool abort_slave; @@ -474,17 +218,11 @@ typedef struct st_master_info int queue_event(MASTER_INFO* mi,const char* buf,ulong event_len); -#define MAX_SLAVE_ERRMSG 1024 - #define RPL_LOG_NAME (rli->group_master_log_name[0] ? 
rli->group_master_log_name :\ "FIRST") #define IO_RPL_LOG_NAME (mi->master_log_name[0] ? mi->master_log_name :\ "FIRST") -/* masks for start/stop operations on io and sql slave threads */ -#define SLAVE_IO 1 -#define SLAVE_SQL 2 - /* If the following is set, if first gives an error, second will be tried. Otherwise, if first fails, we fail. @@ -533,7 +271,8 @@ bool show_binlog_info(THD* thd); const char *print_slave_db_safe(const char *db); int check_expected_error(THD* thd, RELAY_LOG_INFO* rli, int error_code); void skip_load_data_infile(NET* net); -void slave_print_error(RELAY_LOG_INFO* rli, int err_code, const char* msg, ...); +void slave_print_msg(enum loglevel level, RELAY_LOG_INFO* rli, + int err_code, const char* msg, ...); void end_slave(); /* clean up */ void init_master_info_with_options(MASTER_INFO* mi); @@ -579,8 +318,12 @@ extern my_string master_ssl_ca, master_ssl_capath, master_ssl_cert, extern I_List<THD> threads; -#endif -#else +#endif /* HAVE_REPLICATION */ + +/* masks for start/stop operations on io and sql slave threads */ #define SLAVE_IO 1 #define SLAVE_SQL 2 -#endif /* HAVE_REPLICATION */ + +#endif + + diff --git a/sql/sp.cc b/sql/sp.cc index 81164131910..c85c1f2afef 100644 --- a/sql/sp.cc +++ b/sql/sp.cc @@ -585,14 +585,14 @@ db_create_routine(THD *thd, int type, sp_head *sp) } ret= SP_OK; - if (table->file->write_row(table->record[0])) + if (table->file->ha_write_row(table->record[0])) ret= SP_WRITE_ROW_FAILED; else if (mysql_bin_log.is_open()) { thd->clear_error(); /* Such a statement can always go directly to binlog, no trans cache */ - Query_log_event qinfo(thd, thd->query, thd->query_length, 0, FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::MYSQL_QUERY_TYPE, + thd->query, thd->query_length, FALSE, FALSE); } } @@ -618,7 +618,7 @@ db_drop_routine(THD *thd, int type, sp_name *name) DBUG_RETURN(SP_OPEN_TABLE_FAILED); if ((ret= db_find_routine_aux(thd, type, name, table)) == SP_OK) { - if (table->file->delete_row(table->record[0])) + if (table->file->ha_delete_row(table->record[0])) ret= SP_DELETE_ROW_FAILED; } close_thread_tables(thd); @@ -653,7 +653,7 @@ db_update_routine(THD *thd, int type, sp_name *name, st_sp_chistics *chistics) table->field[MYSQL_PROC_FIELD_COMMENT]->store(chistics->comment.str, chistics->comment.length, system_charset_info); - if ((table->file->update_row(table->record[1],table->record[0]))) + if ((table->file->ha_update_row(table->record[1],table->record[0]))) ret= SP_WRITE_ROW_FAILED; } close_thread_tables(thd); @@ -873,7 +873,7 @@ sp_drop_db_routines(THD *thd, char *db) do { - if (! table->file->delete_row(table->record[0])) + if (! table->file->ha_delete_row(table->record[0])) deleted= TRUE; /* We deleted something */ else { diff --git a/sql/sp_head.cc b/sql/sp_head.cc index 4a252fc4d86..63d1388473e 100644 --- a/sql/sp_head.cc +++ b/sql/sp_head.cc @@ -695,6 +695,9 @@ int cmp_splocal_locations(Item_splocal * const *a, Item_splocal * const *b) /* StoredRoutinesBinlogging + This paragraph applies only to statement-based binlogging. Row-based + binlogging does not need anything special like this. + Top-down overview: 1. Statements @@ -1258,56 +1261,62 @@ sp_head::execute_function(THD *thd, Item **argp, uint argcount, thd->spcont= nctx; - binlog_save_options= thd->options; - need_binlog_call= mysql_bin_log.is_open() && (thd->options & OPTION_BIN_LOG); + /* + If row-based binlogging, we don't need to binlog the function's call, let + each substatement be binlogged its way. 
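
    To make the contrast concrete (hypothetical function f() that inserts one
    row): under statement-based logging the union of f()'s events is collapsed
    into a single generated statement such as DO `f`(...), built below; under
    row-based logging nothing is written here, and the INSERT inside f() emits
    its own Table_map and Write_rows events.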
+ */ + need_binlog_call= mysql_bin_log.is_open() && + (thd->options & OPTION_BIN_LOG) && !binlog_row_based; if (need_binlog_call) { reset_dynamic(&thd->user_var_events); mysql_bin_log.start_union_events(thd); + binlog_save_options= thd->options; + thd->options&= ~OPTION_BIN_LOG; } - - thd->options&= ~OPTION_BIN_LOG; + err_status= execute(thd); - thd->options= binlog_save_options; - - if (need_binlog_call) - mysql_bin_log.stop_union_events(thd); - if (need_binlog_call && thd->binlog_evt_union.unioned_events) + if (need_binlog_call) { - char buf[256]; - String bufstr(buf, sizeof(buf), &my_charset_bin); - bufstr.length(0); - bufstr.append(STRING_WITH_LEN("DO ")); - append_identifier(thd, &bufstr, m_name.str, m_name.length); - bufstr.append('('); - for (uint i=0; i < argcount; i++) - { - String str_value_holder; - String *str_value; - - if (i) - bufstr.append(','); - - str_value= sp_get_item_value(param_values[i], &str_value_holder); - - if (str_value) - bufstr.append(*str_value); - else - bufstr.append(STRING_WITH_LEN("NULL")); - } - bufstr.append(')'); - - Query_log_event qinfo(thd, bufstr.ptr(), bufstr.length(), - thd->binlog_evt_union.unioned_events_trans, FALSE); - if (mysql_bin_log.write(&qinfo) && - thd->binlog_evt_union.unioned_events_trans) + mysql_bin_log.stop_union_events(thd); + thd->options= binlog_save_options; + if (thd->binlog_evt_union.unioned_events) { - push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR, - "Invoked ROUTINE modified a transactional table but MySQL " - "failed to reflect this change in the binary log"); + char buf[256]; + String bufstr(buf, sizeof(buf), &my_charset_bin); + bufstr.length(0); + bufstr.append(STRING_WITH_LEN("DO ")); + append_identifier(thd, &bufstr, m_name.str, m_name.length); + bufstr.append('('); + for (uint i=0; i < argcount; i++) + { + String str_value_holder; + String *str_value; + + if (i) + bufstr.append(','); + + str_value= sp_get_item_value(param_values[i], &str_value_holder); + + if (str_value) + bufstr.append(*str_value); + else + bufstr.append(STRING_WITH_LEN("NULL")); + } + bufstr.append(')'); + + Query_log_event qinfo(thd, bufstr.ptr(), bufstr.length(), + thd->binlog_evt_union.unioned_events_trans, FALSE); + if (mysql_bin_log.write(&qinfo) && + thd->binlog_evt_union.unioned_events_trans) + { + push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR, + "Invoked ROUTINE modified a transactional table but MySQL " + "failed to reflect this change in the binary log"); + } + reset_dynamic(&thd->user_var_events); } - reset_dynamic(&thd->user_var_events); } if (m_type == TYPE_ENUM_FUNCTION && !err_status) diff --git a/sql/sql_acl.cc b/sql/sql_acl.cc index 5ee7bf8fd58..42e99bbd1bc 100644 --- a/sql/sql_acl.cc +++ b/sql/sql_acl.cc @@ -1468,8 +1468,7 @@ bool change_password(THD *thd, const char *host, const char *user, acl_user->host.hostname ? 
acl_user->host.hostname : "", new_password)); thd->clear_error(); - Query_log_event qinfo(thd, buff, query_length, 0, FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::MYSQL_QUERY_TYPE, buff, query_length, FALSE, FALSE); } end: close_thread_tables(thd); @@ -1654,7 +1653,7 @@ static bool update_user_table(THD *thd, TABLE *table, } store_record(table,record[1]); table->field[2]->store(new_password, new_password_len, system_charset_info); - if ((error=table->file->update_row(table->record[1],table->record[0]))) + if ((error=table->file->ha_update_row(table->record[1],table->record[0]))) { table->file->print_error(error,MYF(0)); /* purecov: deadcode */ DBUG_RETURN(1); @@ -1871,14 +1870,14 @@ static int replace_user_table(THD *thd, TABLE *table, const LEX_USER &combo, */ table->file->ha_retrieve_all_cols(); if (cmp_record(table,record[1]) && - (error=table->file->update_row(table->record[1],table->record[0]))) + (error=table->file->ha_update_row(table->record[1],table->record[0]))) { // This should never happen table->file->print_error(error,MYF(0)); /* purecov: deadcode */ error= -1; /* purecov: deadcode */ goto end; /* purecov: deadcode */ } } - else if ((error=table->file->write_row(table->record[0]))) // insert + else if ((error=table->file->ha_write_row(table->record[0]))) // insert { // This should never happen if (error && error != HA_ERR_FOUND_DUPP_KEY && error != HA_ERR_FOUND_DUPP_UNIQUE) /* purecov: inspected */ @@ -1988,16 +1987,17 @@ static int replace_db_table(TABLE *table, const char *db, if (rights) { table->file->ha_retrieve_all_cols(); - if ((error=table->file->update_row(table->record[1],table->record[0]))) + if ((error=table->file->ha_update_row(table->record[1], + table->record[0]))) goto table_error; /* purecov: deadcode */ } else /* must have been a revoke of all privileges */ { - if ((error = table->file->delete_row(table->record[1]))) + if ((error = table->file->ha_delete_row(table->record[1]))) goto table_error; /* purecov: deadcode */ } } - else if (rights && (error=table->file->write_row(table->record[0]))) + else if (rights && (error=table->file->ha_write_row(table->record[0]))) { if (error && error != HA_ERR_FOUND_DUPP_KEY) /* purecov: inspected */ goto table_error; /* purecov: deadcode */ @@ -2365,9 +2365,9 @@ static int replace_column_table(GRANT_TABLE *g_t, { GRANT_COLUMN *grant_column; if (privileges) - error=table->file->update_row(table->record[1],table->record[0]); + error=table->file->ha_update_row(table->record[1],table->record[0]); else - error=table->file->delete_row(table->record[1]); + error=table->file->ha_delete_row(table->record[1]); if (error) { table->file->print_error(error,MYF(0)); /* purecov: inspected */ @@ -2382,7 +2382,7 @@ static int replace_column_table(GRANT_TABLE *g_t, else // new grant { GRANT_COLUMN *grant_column; - if ((error=table->file->write_row(table->record[0]))) + if ((error=table->file->ha_write_row(table->record[0]))) { table->file->print_error(error,MYF(0)); /* purecov: inspected */ result= -1; /* purecov: inspected */ @@ -2434,8 +2434,8 @@ static int replace_column_table(GRANT_TABLE *g_t, if (privileges) { int tmp_error; - if ((tmp_error=table->file->update_row(table->record[1], - table->record[0]))) + if ((tmp_error=table->file->ha_update_row(table->record[1], + table->record[0]))) { /* purecov: deadcode */ table->file->print_error(tmp_error,MYF(0)); /* purecov: deadcode */ result= -1; /* purecov: deadcode */ @@ -2447,7 +2447,7 @@ static int replace_column_table(GRANT_TABLE *g_t, else { int tmp_error; - if 
((tmp_error = table->file->delete_row(table->record[1]))) + if ((tmp_error = table->file->ha_delete_row(table->record[1]))) { /* purecov: deadcode */ table->file->print_error(tmp_error,MYF(0)); /* purecov: deadcode */ result= -1; /* purecov: deadcode */ @@ -2555,15 +2555,15 @@ static int replace_table_table(THD *thd, GRANT_TABLE *grant_table, { if (store_table_rights || store_col_rights) { - if ((error=table->file->update_row(table->record[1],table->record[0]))) + if ((error=table->file->ha_update_row(table->record[1],table->record[0]))) goto table_error; /* purecov: deadcode */ } - else if ((error = table->file->delete_row(table->record[1]))) + else if ((error = table->file->ha_delete_row(table->record[1]))) goto table_error; /* purecov: deadcode */ } else { - error=table->file->write_row(table->record[0]); + error=table->file->ha_write_row(table->record[0]); if (error && error != HA_ERR_FOUND_DUPP_KEY) goto table_error; /* purecov: deadcode */ } @@ -2672,15 +2672,15 @@ static int replace_routine_table(THD *thd, GRANT_NAME *grant_name, { if (store_proc_rights) { - if ((error=table->file->update_row(table->record[1],table->record[0]))) + if ((error=table->file->ha_update_row(table->record[1],table->record[0]))) goto table_error; } - else if ((error= table->file->delete_row(table->record[1]))) + else if ((error= table->file->ha_delete_row(table->record[1]))) goto table_error; } else { - error=table->file->write_row(table->record[0]); + error=table->file->ha_write_row(table->record[0]); if (error && error != HA_ERR_FOUND_DUPP_KEY) goto table_error; } @@ -3119,6 +3119,16 @@ bool mysql_routine_grant(THD *thd, TABLE_LIST *table_list, bool is_proc, } grant_option=TRUE; thd->mem_root= old_root; + /* + This flush is here only becuase there is code that writes rows to + system tables after executing a binlog_query(). + + TODO: Ensure that no writes are executed after a binlog_query() by + moving the writes to before calling binlog_query(). Then remove + this line (and add an assert inside send_ok() that checks that + everything is in a consistent state). + */ + thd->binlog_flush_pending_rows_event(true); rw_unlock(&LOCK_grant); if (!result && !no_error) send_ok(thd); @@ -4670,13 +4680,13 @@ static int modify_grant_table(TABLE *table, Field *host_field, system_charset_info); user_field->store(user_to->user.str, user_to->user.length, system_charset_info); - if ((error= table->file->update_row(table->record[1], table->record[0]))) + if ((error= table->file->ha_update_row(table->record[1], table->record[0]))) table->file->print_error(error, MYF(0)); } else { /* delete */ - if ((error=table->file->delete_row(table->record[0]))) + if ((error=table->file->ha_delete_row(table->record[0]))) table->file->print_error(error, MYF(0)); } @@ -5683,7 +5693,7 @@ void update_schema_privilege(TABLE *table, char *buff, const char* db, table->field[i++]->store(column, col_length, cs); table->field[i++]->store(priv, priv_length, cs); table->field[i]->store(is_grantable, strlen(is_grantable), cs); - table->file->write_row(table->record[0]); + table->file->ha_write_row(table->record[0]); } diff --git a/sql/sql_base.cc b/sql/sql_base.cc index 8f0f0d779e4..b9cd1afcd26 100644 --- a/sql/sql_base.cc +++ b/sql/sql_base.cc @@ -1029,6 +1029,19 @@ void close_thread_tables(THD *thd, bool lock_in_use, bool skip_derived) /* Fallthrough */ } + /* + For RBR: before calling close_thread_tables(), storage engines + should autocommit. 
Hence if there is a a pending event, it belongs + to a non-transactional engine, which writes directly to the table, + and should therefore be flushed before unlocking and closing the + tables. The test above for locked tables will not be triggered + since RBR locks and unlocks tables on a per-event basis. + + TODO (WL#3023): Change the semantics so that RBR does not lock and + unlock tables on a per-event basis. + */ + thd->binlog_flush_pending_rows_event(true); + if (thd->lock) { mysql_unlock_tables(thd, thd->lock); @@ -1171,7 +1184,8 @@ void close_temporary_tables(THD *thd) next=table->next; close_temporary(table, 1, 1); } - if (query && found_user_tables && mysql_bin_log.is_open()) + if (query && found_user_tables && mysql_bin_log.is_open() && + !binlog_row_based) // CREATE TEMP TABLE not binlogged if row-based { /* The -1 is to remove last ',' */ thd->clear_error(); @@ -2038,6 +2052,8 @@ static bool reopen_table(TABLE *table) tmp.keys_in_use_for_query= tmp.s->keys_in_use; tmp.used_keys= tmp.s->keys_for_keyread; + tmp.s->table_map_id= table->s->table_map_id; + /* Get state */ tmp.in_use= thd; tmp.reginfo.lock_type=table->reginfo.lock_type; @@ -2343,6 +2359,48 @@ void abort_locked_tables(THD *thd,const char *db, const char *table_name) /* + Function to assign a new table map id to a table. + + PARAMETERS + + table - Pointer to table structure + + PRE-CONDITION(S) + + table is non-NULL + The LOCK_open mutex is locked + + POST-CONDITION(S) + + table->s->table_map_id is given a value that with a high certainty + is not used by any other table. + + table->s->table_map_id is not ULONG_MAX. + */ +static void assign_new_table_id(TABLE *table) +{ + static ulong last_table_id= ULONG_MAX; + + DBUG_ENTER("assign_new_table_id(TABLE*)"); + + /* Preconditions */ + DBUG_ASSERT(table != NULL); + safe_mutex_assert_owner(&LOCK_open); + + ulong tid= ++last_table_id; /* get next id */ + /* There is one reserved number that cannot be used. */ + if (unlikely(tid == ULONG_MAX)) + tid= ++last_table_id; + table->s->table_map_id= tid; + DBUG_PRINT("info", ("table_id=%lu", tid)); + + /* Post conditions */ + DBUG_ASSERT(table->s->table_map_id != ULONG_MAX); + + DBUG_VOID_RETURN; +} + +/* Load a table definition from file and open unireg table SYNOPSIS @@ -2490,7 +2548,21 @@ retry: goto err; break; } - + + /* + We assign a new table id under the protection of the LOCK_open + mutex. We assign a new table id here instead of inside openfrm() + since that function can be used without acquiring any lock (e.g., + inside ha_create_table()). Insted of creatint a new mutex and + using it for the sole purpose of serializing accesses to a static + variable, we assign the table id here. + + CAVEAT. This means that the table cannot be used for + binlogging/replication purposes, unless open_table() has been called + directly or indirectly. 
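
    A standalone sketch of the id-assignment rule implemented below; the real
    code serializes callers with LOCK_open instead of a dedicated mutex:

      #include <climits>

      static unsigned long last_table_id= ULONG_MAX;  /* first ++ wraps to 0 */

      static unsigned long next_table_map_id()    /* caller must serialize */
      {
        unsigned long tid= ++last_table_id;
        if (tid == ULONG_MAX)     /* reserved value, never handed out */
          tid= ++last_table_id;
        return tid;
      }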
+ */ + assign_new_table_id(entry); + if (Table_triggers_list::check_n_load(thd, share->db.str, share->table_name.str, entry, 0)) { @@ -2511,10 +2583,11 @@ retry: uint query_buf_size= 20 + share->db.length + share->table_name.length +1; if ((query= (char*) my_malloc(query_buf_size,MYF(MY_WME)))) { + /* this DELETE FROM is needed even with row-based binlogging */ end = strxmov(strmov(query, "DELETE FROM `"), share->db.str,"`.`",share->table_name.str,"`", NullS); - Query_log_event qinfo(thd, query, (ulong)(end-query), 0, FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::STMT_QUERY_TYPE, + query, (ulong)(end-query), FALSE, FALSE); my_free(query, MYF(0)); } else diff --git a/sql/sql_binlog.cc b/sql/sql_binlog.cc new file mode 100644 index 00000000000..cc0e9714d85 --- /dev/null +++ b/sql/sql_binlog.cc @@ -0,0 +1,135 @@ +/* Copyright (C) 2005 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "mysql_priv.h" +#include "base64.h" + +/* + Execute a BINLOG statement + + TODO: This currently assumes a MySQL 5.x binlog. + When we'll have binlog with a different format, to execute the + BINLOG command properly the server will need to know which format + the BINLOG command's event is in. mysqlbinlog should then send + the Format_description_log_event of the binlog it reads and the + server thread should cache this format into + rli->description_event_for_exec. +*/ + +void mysql_client_binlog_statement(THD* thd) +{ + DBUG_PRINT("info",("binlog base64: '%*s'", + (thd->lex->comment.length < 2048 ? + thd->lex->comment.length : 2048), + thd->lex->comment.str)); + + /* + Temporarily turn off send_ok, since different events handle this + differently + */ + my_bool nsok= thd->net.no_send_ok; + thd->net.no_send_ok= TRUE; + + const my_size_t coded_len= thd->lex->comment.length + 1; + const my_size_t event_len= base64_needed_decoded_length(coded_len); + DBUG_ASSERT(coded_len > 0); + + /* + Allocation + */ + if (!thd->rli_fake) + thd->rli_fake= new RELAY_LOG_INFO; + + const Format_description_log_event *desc= + new Format_description_log_event(4); + + const char *error= 0; + char *buf= (char *) my_malloc(event_len, MYF(MY_WME)); + Log_event *ev; + int res; + + /* + Out of memory check + */ + if (!(thd->rli_fake && desc && buf)) + { + my_error(ER_OUTOFMEMORY, MYF(0), 1); /* needed 1 bytes */ + goto end; + } + + thd->rli_fake->sql_thd= thd; + thd->rli_fake->no_storage= TRUE; + + res= base64_decode(thd->lex->comment.str, coded_len, buf); + + DBUG_PRINT("info",("binlog base64 decoded_len=%d, event_len=%d\n", + res, uint4korr(buf + EVENT_LEN_OFFSET))); + /* + Note that 'res' is the correct event length, 'event_len' was + calculated based on the base64-string that possibly contained + extra spaces, so it can be longer than the real event. 
+ */ + if (res < EVENT_LEN_OFFSET + || (uint) res != uint4korr(buf+EVENT_LEN_OFFSET)) + { + my_error(ER_SYNTAX_ERROR, MYF(0)); + goto end; + } + + ev= Log_event::read_log_event(buf, res, &error, desc); + + DBUG_PRINT("info",("binlog base64 err=%s", error)); + if (!ev) + { + /* + This could actually be an out-of-memory, but it is more + likely causes by a bad statement + */ + my_error(ER_SYNTAX_ERROR, MYF(0)); + goto end; + } + + DBUG_PRINT("info",("ev->get_type_code()=%d", ev->get_type_code())); + DBUG_PRINT("info",("buf+EVENT_TYPE_OFFSET=%d", buf+EVENT_TYPE_OFFSET)); + + ev->thd= thd; + if (ev->exec_event(thd->rli_fake)) + { + my_error(ER_UNKNOWN_ERROR, MYF(0), "Error executing BINLOG statement"); + goto end; + } + + /* + Restore setting of no_send_ok + */ + thd->net.no_send_ok= nsok; + + DBUG_PRINT("info",("binlog base64 execution finished successfully")); + send_ok(thd); + +end: + /* + Restore setting of no_send_ok + */ + thd->net.no_send_ok= nsok; + + if (ev) + delete ev; + if (desc) + delete desc; + if (buf) + my_free(buf, MYF(0)); +} diff --git a/sql/sql_class.cc b/sql/sql_class.cc index a28324c5e28..08d89228a72 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -27,6 +27,8 @@ #endif #include "mysql_priv.h" +#include <my_bitmap.h> +#include "log_event.h" #include <m_ctype.h> #include <sys/stat.h> #include <thr_alarm.h> @@ -174,7 +176,7 @@ Open_tables_state::Open_tables_state(ulong version_arg) THD::THD() :Statement(CONVENTIONAL_EXECUTION, 0, ALLOC_ROOT_MIN_BLOCK_SIZE, 0), - Open_tables_state(refresh_version), + Open_tables_state(refresh_version), rli_fake(0), lock_id(&main_lock_id), user_time(0), in_sub_stmt(0), global_read_lock(0), is_fatal_error(0), rand_used(0), time_zone_used(0), @@ -227,6 +229,9 @@ THD::THD() ull=0; system_thread= cleanup_done= abort_on_warning= no_warnings_for_error= 0; peer_port= 0; // For SHOW PROCESSLIST +#ifdef HAVE_ROW_BASED_REPLICATION + transaction.m_pending_rows_event= 0; +#endif #ifdef __WIN__ real_id = 0; #endif @@ -440,6 +445,11 @@ THD::~THD() #ifndef DBUG_OFF dbug_sentry= THD_SENTRY_GONE; #endif +#ifndef EMBEDDED_LIBRARY + if (rli_fake) + delete rli_fake; +#endif + DBUG_VOID_RETURN; } @@ -1959,7 +1969,8 @@ void THD::reset_sub_statement_state(Sub_statement_state *backup, backup->client_capabilities= client_capabilities; backup->savepoints= transaction.savepoints; - if (!lex->requires_prelocking() || is_update_query(lex->sql_command)) + if ((!lex->requires_prelocking() || is_update_query(lex->sql_command)) && + !binlog_row_based) options&= ~OPTION_BIN_LOG; /* Disable result sets */ client_capabilities &= ~CLIENT_MULTI_RESULTS; @@ -2101,3 +2112,439 @@ void xid_cache_delete(XID_STATE *xid_state) pthread_mutex_unlock(&LOCK_xid_cache); } +/* + Implementation of interface to write rows to the binary log through the + thread. The thread is responsible for writing the rows it has + inserted/updated/deleted. +*/ + +#ifndef MYSQL_CLIENT +#ifdef HAVE_ROW_BASED_REPLICATION + +/* + Template member function for ensuring that there is an rows log + event of the apropriate type before proceeding. + + PRE CONDITION: + - Events of type 'RowEventT' have the type code 'type_code'. + + POST CONDITION: + If a non-NULL pointer is returned, the pending event for thread 'thd' will + be an event of type 'RowEventT' (which have the type code 'type_code') + will either empty or have enough space to hold 'needed' bytes. 
In + addition, the columns bitmap will be correct for the row, meaning that + the pending event will be flushed if the columns in the event differ from + the columns suppled to the function. + + RETURNS + If no error, a non-NULL pending event (either one which already existed or + the newly created one). + If error, NULL. + */ + +template <class RowsEventT> Rows_log_event* +THD::binlog_prepare_pending_rows_event(TABLE* table, uint32 serv_id, + MY_BITMAP const* cols, + my_size_t colcnt, + my_size_t needed, + bool is_transactional) +{ + /* Pre-conditions */ + DBUG_ASSERT(table->s->table_map_id != ULONG_MAX); + + /* Fetch the type code for the RowsEventT template parameter */ + int const type_code= RowsEventT::TYPE_CODE; + + /* + There is no good place to set up the transactional data, so we + have to do it here. + */ + if (binlog_setup_trx_data()) + return NULL; + + Rows_log_event* pending= binlog_get_pending_rows_event(); + + if (unlikely(pending && !pending->is_valid())) + return NULL; + + /* + Check if the current event is non-NULL and a write-rows + event. Also check if the table provided is mapped: if it is not, + then we have switched to writing to a new table. + If there is no pending event, we need to create one. If there is a pending + event, but it's not about the same table id, or not of the same type + (between Write, Update and Delete), or not the same affected columns, or + going to be too big, flush this event to disk and create a new pending + event. + */ + if (!pending || + pending->server_id != serv_id || + pending->get_table_id() != table->s->table_map_id || + pending->get_type_code() != type_code || + pending->get_data_size() + needed > opt_binlog_rows_event_max_size || + pending->get_width() != colcnt || + !bitmap_cmp(pending->get_cols(), cols)) + { + /* Create a new RowsEventT... */ + Rows_log_event* const + ev= new RowsEventT(this, table, table->s->table_map_id, cols, + is_transactional); + if (unlikely(!ev)) + return NULL; + ev->server_id= serv_id; // I don't like this, it's too easy to forget. + /* + flush the pending event and replace it with the newly created + event... + */ + if (unlikely(mysql_bin_log.flush_and_set_pending_rows_event(this, ev))) + { + delete ev; + return NULL; + } + + return ev; /* This is the new pending event */ + } + return pending; /* This is the current pending event */ +} + +/* + Instansiate the versions we need, we have -fno-implicit-template as + compiling option. 
+*/ +template Rows_log_event* +THD::binlog_prepare_pending_rows_event<Write_rows_log_event> +(TABLE*, uint32, MY_BITMAP const*, my_size_t colcnt, size_t, bool); + +template Rows_log_event* +THD::binlog_prepare_pending_rows_event<Delete_rows_log_event> +(TABLE*, uint32, MY_BITMAP const*, my_size_t colcnt, size_t, bool); + +template Rows_log_event* +THD::binlog_prepare_pending_rows_event<Update_rows_log_event> +(TABLE*, uint32, MY_BITMAP const*, my_size_t colcnt, size_t, bool); + +static char const* +field_type_name(enum_field_types type) +{ + switch (type) + { + case MYSQL_TYPE_DECIMAL: + return "MYSQL_TYPE_DECIMAL"; + case MYSQL_TYPE_TINY: + return "MYSQL_TYPE_TINY"; + case MYSQL_TYPE_SHORT: + return "MYSQL_TYPE_SHORT"; + case MYSQL_TYPE_LONG: + return "MYSQL_TYPE_LONG"; + case MYSQL_TYPE_FLOAT: + return "MYSQL_TYPE_FLOAT"; + case MYSQL_TYPE_DOUBLE: + return "MYSQL_TYPE_DOUBLE"; + case MYSQL_TYPE_NULL: + return "MYSQL_TYPE_NULL"; + case MYSQL_TYPE_TIMESTAMP: + return "MYSQL_TYPE_TIMESTAMP"; + case MYSQL_TYPE_LONGLONG: + return "MYSQL_TYPE_LONGLONG"; + case MYSQL_TYPE_INT24: + return "MYSQL_TYPE_INT24"; + case MYSQL_TYPE_DATE: + return "MYSQL_TYPE_DATE"; + case MYSQL_TYPE_TIME: + return "MYSQL_TYPE_TIME"; + case MYSQL_TYPE_DATETIME: + return "MYSQL_TYPE_DATETIME"; + case MYSQL_TYPE_YEAR: + return "MYSQL_TYPE_YEAR"; + case MYSQL_TYPE_NEWDATE: + return "MYSQL_TYPE_NEWDATE"; + case MYSQL_TYPE_VARCHAR: + return "MYSQL_TYPE_VARCHAR"; + case MYSQL_TYPE_BIT: + return "MYSQL_TYPE_BIT"; + case MYSQL_TYPE_NEWDECIMAL: + return "MYSQL_TYPE_NEWDECIMAL"; + case MYSQL_TYPE_ENUM: + return "MYSQL_TYPE_ENUM"; + case MYSQL_TYPE_SET: + return "MYSQL_TYPE_SET"; + case MYSQL_TYPE_TINY_BLOB: + return "MYSQL_TYPE_TINY_BLOB"; + case MYSQL_TYPE_MEDIUM_BLOB: + return "MYSQL_TYPE_MEDIUM_BLOB"; + case MYSQL_TYPE_LONG_BLOB: + return "MYSQL_TYPE_LONG_BLOB"; + case MYSQL_TYPE_BLOB: + return "MYSQL_TYPE_BLOB"; + case MYSQL_TYPE_VAR_STRING: + return "MYSQL_TYPE_VAR_STRING"; + case MYSQL_TYPE_STRING: + return "MYSQL_TYPE_STRING"; + case MYSQL_TYPE_GEOMETRY: + return "MYSQL_TYPE_GEOMETRY"; + } + return "Unknown"; +} + +my_size_t THD::max_row_length_blob(TABLE *table, const byte *data) const +{ + my_size_t length= 0; + TABLE_SHARE *table_s= table->s; + uint* const beg= table_s->blob_field; + uint* const end= beg + table_s->blob_fields; + + for (uint *ptr= beg ; ptr != end ; ++ptr) + { + Field_blob* const blob= (Field_blob*) table->field[*ptr]; + length+= blob->get_length(data + blob->offset()) + 2; + } + + return length; +} + +my_size_t THD::pack_row(TABLE *table, MY_BITMAP const* cols, byte *row_data, + const byte *record) const +{ + Field **p_field= table->field, *field= *p_field; + int n_null_bytes= table->s->null_bytes; + my_ptrdiff_t const offset= record - (byte*) table->record[0]; + + memcpy(row_data, record, n_null_bytes); + byte *ptr= row_data+n_null_bytes; + + for (int i= 0 ; field ; i++, p_field++, field= *p_field) + { + if (bitmap_is_set(cols,i)) + ptr= field->pack(ptr, field->ptr + offset); + } + + /* + my_ptrdiff_t is signed, size_t is unsigned. Assert that the + conversion will work correctly. + */ + DBUG_ASSERT(ptr - row_data >= 0); + return (static_cast<size_t>(ptr - row_data)); +} + +int THD::binlog_write_row(TABLE* table, bool is_trans, + MY_BITMAP const* cols, my_size_t colcnt, + byte const *record) +{ + DBUG_ASSERT(binlog_row_based && mysql_bin_log.is_open()); + + /* + Pack records into format for transfer. We are allocating more + memory than needed, but that doesn't matter. 
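
    More precisely: the buffer is sized from max_row_length(), which assumes
    every column is stored at its full length, while Field::pack() writes
    variable-length columns at their actual length, so the packed image can
    only be smaller. The 2*max_len allocation lets an UPDATE reuse the same
    per-table buffer for both images, roughly:

      byte *before_row= table->write_row_record;
      byte *after_row = before_row + before_maxlen;   /* see binlog_update_row() */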
+ */ + bool error= 0; + byte *row_data= table->write_row_record; + my_size_t const max_len= max_row_length(table, record); + + /* + * Allocate room for a row (if needed) + */ + if (!row_data) + { + if (!table->s->blob_fields) + { + /* multiply max_len by 2 so it can be used for update_row as well */ + table->write_row_record= alloc_root(&table->mem_root, 2*max_len); + if (!table->write_row_record) + return HA_ERR_OUT_OF_MEM; + row_data= table->write_row_record; + } + else if (unlikely(!(row_data= my_malloc(max_len, MYF(MY_WME))))) + return HA_ERR_OUT_OF_MEM; + } + my_size_t const len= pack_row(table, cols, row_data, record); + + Rows_log_event* const + ev= binlog_prepare_pending_rows_event<Write_rows_log_event> + (table, server_id, cols, colcnt, len, is_trans); + + /* add_row_data copies row_data to internal buffer */ + error= likely(ev != 0) ? ev->add_row_data(row_data,len) : HA_ERR_OUT_OF_MEM ; + + if (table->write_row_record == 0) + my_free(row_data, MYF(MY_WME)); + + return error; +} + +int THD::binlog_update_row(TABLE* table, bool is_trans, + MY_BITMAP const* cols, my_size_t colcnt, + const byte *before_record, + const byte *after_record) +{ + DBUG_ASSERT(binlog_row_based && mysql_bin_log.is_open()); + + bool error= 0; + my_size_t const before_maxlen = max_row_length(table, before_record); + my_size_t const after_maxlen = max_row_length(table, after_record); + + byte *row_data= table->write_row_record; + byte *before_row, *after_row; + if (row_data != 0) + { + before_row= row_data; + after_row= before_row + before_maxlen; + } + else + { + if (unlikely(!(row_data= my_multi_malloc(MYF(MY_WME), + &before_row, before_maxlen, + &after_row, after_maxlen, + NULL)))) + return HA_ERR_OUT_OF_MEM; + } + + my_size_t const before_size= pack_row(table, cols, before_row, + before_record); + my_size_t const after_size= pack_row(table, cols, after_row, + after_record); + + Rows_log_event* const + ev= binlog_prepare_pending_rows_event<Update_rows_log_event> + (table, server_id, cols, colcnt, before_size + after_size, is_trans); + + error= (unlikely(!ev)) || ev->add_row_data(before_row, before_size) || + ev->add_row_data(after_row, after_size); + + if (!table->write_row_record) + { + /* add_row_data copies row_data to internal buffer */ + my_free(row_data, MYF(MY_WME)); + } + + return error; +} + +int THD::binlog_delete_row(TABLE* table, bool is_trans, + MY_BITMAP const* cols, my_size_t colcnt, + byte const *record) +{ + DBUG_ASSERT(binlog_row_based && mysql_bin_log.is_open()); + + /* + Pack records into format for transfer. We are allocating more + memory than needed, but that doesn't matter. + */ + bool error= 0; + my_size_t const max_len= max_row_length(table, record); + byte *row_data= table->write_row_record; + if (!row_data && unlikely(!(row_data= my_malloc(max_len, MYF(MY_WME))))) + return HA_ERR_OUT_OF_MEM; + my_size_t const len= pack_row(table, cols, row_data, record); + + Rows_log_event* const + ev= binlog_prepare_pending_rows_event<Delete_rows_log_event> + (table, server_id, cols, colcnt, len, is_trans); + + error= (unlikely(!ev)) || ev->add_row_data(row_data, len); + + /* add_row_data copies row_data */ + if (table->write_row_record == 0) + my_free(row_data, MYF(MY_WME)); + + return error; +} + + +int THD::binlog_flush_pending_rows_event(bool stmt_end) +{ + DBUG_ENTER("THD::binlog_flush_pending_rows_event"); + if (!binlog_row_based || !mysql_bin_log.is_open()) + DBUG_RETURN(0); + + /* + Mark the event as the last event of a statement if the stmt_end + flag is set. 
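
    Caller-side shape of the row logging interface, for orientation (variable
    names are placeholders; the real call sites are in the handler layer
    elsewhere in this changeset):

      /* one call per changed row; rows accumulate in the pending event */
      thd->binlog_write_row(table, using_trans, cols, colcnt, table->record[0]);

      /* at the end of the statement: set STMT_END_F and write the event */
      thd->binlog_flush_pending_rows_event(true);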
+ */ + int error= 0; + if (Rows_log_event *pending= binlog_get_pending_rows_event()) + { + if (stmt_end) + { + pending->set_flags(Rows_log_event::STMT_END_F); + pending->flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F; + } + + /* + We only bother to set the pending event if it is non-NULL. This + is essential for correctness, since there is not necessarily a + trx_data created for the thread if the pending event is NULL. + */ + error= mysql_bin_log.flush_and_set_pending_rows_event(this, 0); + } + + DBUG_RETURN(error); +} + + +void THD::binlog_delete_pending_rows_event() +{ + if (Rows_log_event *pending= binlog_get_pending_rows_event()) + { + delete pending; + binlog_set_pending_rows_event(0); + } +} + +#endif /* HAVE_ROW_BASED_REPLICATION */ + +/* + Member function that will log query, either row-based or + statement-based depending on the value of the 'binlog_row_based' + variable and the value of the 'qtype' flag. + + This function should be called after the all calls to ha_*_row() + functions have been issued, but before tables are unlocked and + closed. + + RETURN VALUE + Error code, or 0 if no error. +*/ +int THD::binlog_query(THD::enum_binlog_query_type qtype, + char const *query, ulong query_len, + bool is_trans, bool suppress_use) +{ + DBUG_ENTER("THD::binlog_query"); + DBUG_ASSERT(query && mysql_bin_log.is_open()); + int error= binlog_flush_pending_rows_event(true); + switch (qtype) + { + case THD::MYSQL_QUERY_TYPE: + /* + Using this query type is a conveniece hack, since we have been + moving back and forth between using RBR for replication of + system tables and not using it. + + Make sure to change in check_table_binlog_row_based() according + to how you treat this. + */ + case THD::ROW_QUERY_TYPE: + if (binlog_row_based) + DBUG_RETURN(binlog_flush_pending_rows_event(true)); + /* Otherwise, we fall through */ + case THD::STMT_QUERY_TYPE: + /* + Most callers of binlog_query() ignore the error code, assuming + that the statement will always be written to the binlog. In + case of error above, we therefore just continue and write the + statement to the binary log. 
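
    In row-based mode the switch below therefore behaves as follows:
    ROW_QUERY_TYPE and MYSQL_QUERY_TYPE only flush the pending rows events and
    return, while STMT_QUERY_TYPE additionally writes a Query_log_event; in
    statement-based mode all three types end up writing the Query_log_event.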
+ */ + { + Query_log_event qinfo(this, query, query_len, is_trans, suppress_use); + qinfo.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F; + DBUG_RETURN(mysql_bin_log.write(&qinfo)); + } + break; + + case THD::QUERY_TYPE_COUNT: + default: + DBUG_ASSERT(0 <= qtype && qtype < QUERY_TYPE_COUNT); + } + DBUG_RETURN(0); +} + +#endif /* !defined(MYSQL_CLIENT) */ diff --git a/sql/sql_class.h b/sql/sql_class.h index 60dc9a4cbad..1ef3322bc8f 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -21,19 +21,20 @@ #pragma interface /* gcc class implementation */ #endif -// TODO: create log.h and move all the log header stuff there +#include "log.h" +#include "rpl_rli.h" +#include "rpl_tblmap.h" class Query_log_event; class Load_log_event; class Slave_log_event; -class Format_description_log_event; class sp_rcontext; class sp_cache; +class Rows_log_event; enum enum_enable_or_disable { LEAVE_AS_IS, ENABLE, DISABLE }; enum enum_ha_read_modes { RFIRST, RNEXT, RPREV, RLAST, RKEY, RNEXT_SAME }; enum enum_duplicates { DUP_ERROR, DUP_REPLACE, DUP_UPDATE }; -enum enum_log_type { LOG_CLOSED, LOG_TO_BE_OPENED, LOG_NORMAL, LOG_NEW, LOG_BIN}; enum enum_delay_key_write { DELAY_KEY_WRITE_NONE, DELAY_KEY_WRITE_ON, DELAY_KEY_WRITE_ALL }; @@ -50,117 +51,6 @@ extern const char **errmesg; #define TC_HEURISTIC_RECOVER_ROLLBACK 2 extern uint tc_heuristic_recover; -/* - Transaction Coordinator log - a base abstract class - for two different implementations -*/ -class TC_LOG -{ - public: - int using_heuristic_recover(); - TC_LOG() {} - virtual ~TC_LOG() {} - - virtual int open(const char *opt_name)=0; - virtual void close()=0; - virtual int log(THD *thd, my_xid xid)=0; - virtual void unlog(ulong cookie, my_xid xid)=0; -}; - -class TC_LOG_DUMMY: public TC_LOG // use it to disable the logging -{ - public: - int open(const char *opt_name) { return 0; } - void close() { } - int log(THD *thd, my_xid xid) { return 1; } - void unlog(ulong cookie, my_xid xid) { } -}; - -#ifdef HAVE_MMAP -class TC_LOG_MMAP: public TC_LOG -{ - public: // only to keep Sun Forte on sol9x86 happy - typedef enum { - POOL, // page is in pool - ERROR, // last sync failed - DIRTY // new xids added since last sync - } PAGE_STATE; - - private: - typedef struct st_page { - struct st_page *next; // page a linked in a fifo queue - my_xid *start, *end; // usable area of a page - my_xid *ptr; // next xid will be written here - int size, free; // max and current number of free xid slots on the page - int waiters; // number of waiters on condition - PAGE_STATE state; // see above - pthread_mutex_t lock; // to access page data or control structure - pthread_cond_t cond; // to wait for a sync - } PAGE; - - char logname[FN_REFLEN]; - File fd; - my_off_t file_length; - uint npages, inited; - uchar *data; - struct st_page *pages, *syncing, *active, *pool, *pool_last; - /* - note that, e.g. LOCK_active is only used to protect - 'active' pointer, to protect the content of the active page - one has to use active->lock. 
- Same for LOCK_pool and LOCK_sync - */ - pthread_mutex_t LOCK_active, LOCK_pool, LOCK_sync; - pthread_cond_t COND_pool, COND_active; - - public: - TC_LOG_MMAP(): inited(0) {} - int open(const char *opt_name); - void close(); - int log(THD *thd, my_xid xid); - void unlog(ulong cookie, my_xid xid); - int recover(); - - private: - void get_active_from_pool(); - int sync(); - int overflow(); -}; -#else -#define TC_LOG_MMAP TC_LOG_DUMMY -#endif - -extern TC_LOG *tc_log; -extern TC_LOG_MMAP tc_log_mmap; -extern TC_LOG_DUMMY tc_log_dummy; - -/* log info errors */ -#define LOG_INFO_EOF -1 -#define LOG_INFO_IO -2 -#define LOG_INFO_INVALID -3 -#define LOG_INFO_SEEK -4 -#define LOG_INFO_MEM -6 -#define LOG_INFO_FATAL -7 -#define LOG_INFO_IN_USE -8 - -/* bitmap to SQL_LOG::close() */ -#define LOG_CLOSE_INDEX 1 -#define LOG_CLOSE_TO_BE_OPENED 2 -#define LOG_CLOSE_STOP_EVENT 4 - -struct st_relay_log_info; - -typedef struct st_log_info -{ - char log_file_name[FN_REFLEN]; - my_off_t index_file_offset, index_file_start_offset; - my_off_t pos; - bool fatal; // if the purge happens to give us a negative offset - pthread_mutex_t lock; - st_log_info():fatal(0) { pthread_mutex_init(&lock, MY_MUTEX_INIT_FAST);} - ~st_log_info() { pthread_mutex_destroy(&lock);} -} LOG_INFO; - typedef struct st_user_var_events { user_var_entry *user_var_event; @@ -173,188 +63,6 @@ typedef struct st_user_var_events #define RP_LOCK_LOG_IS_ALREADY_LOCKED 1 #define RP_FORCE_ROTATE 2 -class Log_event; - -/* - TODO split MYSQL_LOG into base MYSQL_LOG and - MYSQL_QUERY_LOG, MYSQL_SLOW_LOG, MYSQL_BIN_LOG - most of the code from MYSQL_LOG should be in the MYSQL_BIN_LOG - only (TC_LOG included) - - TODO use mmap instead of IO_CACHE for binlog - (mmap+fsync is two times faster than write+fsync) -*/ - -class MYSQL_LOG: public TC_LOG -{ - private: - /* LOCK_log and LOCK_index are inited by init_pthread_objects() */ - pthread_mutex_t LOCK_log, LOCK_index; - pthread_mutex_t LOCK_prep_xids; - pthread_cond_t COND_prep_xids; - pthread_cond_t update_cond; - ulonglong bytes_written; - time_t last_time,query_start; - IO_CACHE log_file; - IO_CACHE index_file; - char *name; - char time_buff[20],db[NAME_LEN+1]; - char log_file_name[FN_REFLEN],index_file_name[FN_REFLEN]; - /* - The max size before rotation (usable only if log_type == LOG_BIN: binary - logs and relay logs). - For a binlog, max_size should be max_binlog_size. - For a relay log, it should be max_relay_log_size if this is non-zero, - max_binlog_size otherwise. - max_size is set in init(), and dynamically changed (when one does SET - GLOBAL MAX_BINLOG_SIZE|MAX_RELAY_LOG_SIZE) by fix_max_binlog_size and - fix_max_relay_log_size). - */ - ulong max_size; - ulong prepared_xids; /* for tc log - number of xids to remember */ - volatile enum_log_type log_type; - enum cache_type io_cache_type; - // current file sequence number for load data infile binary logging - uint file_id; - uint open_count; // For replication - int readers_count; - bool write_error, inited; - bool need_start_event; - /* - no_auto_events means we don't want any of these automatic events : - Start/Rotate/Stop. That is, in 4.x when we rotate a relay log, we don't - want a Rotate_log event to be written to the relay log. When we start a - relay log etc. So in 4.x this is 1 for relay logs, 0 for binlogs. - In 5.0 it's 0 for relay logs too! - */ - bool no_auto_events; - friend class Log_event; - -public: - /* - These describe the log's format. This is used only for relay logs. 
- _for_exec is used by the SQL thread, _for_queue by the I/O thread. It's - necessary to have 2 distinct objects, because the I/O thread may be reading - events in a different format from what the SQL thread is reading (consider - the case of a master which has been upgraded from 5.0 to 5.1 without doing - RESET MASTER, or from 4.x to 5.0). - */ - Format_description_log_event *description_event_for_exec, - *description_event_for_queue; - - MYSQL_LOG(); - /* - note that there's no destructor ~MYSQL_LOG() ! - The reason is that we don't want it to be automatically called - on exit() - but only during the correct shutdown process - */ - - int open(const char *opt_name); - void close(); - int log(THD *thd, my_xid xid); - void unlog(ulong cookie, my_xid xid); - int recover(IO_CACHE *log, Format_description_log_event *fdle); - void reset_bytes_written() - { - bytes_written = 0; - } - void harvest_bytes_written(ulonglong* counter) - { -#ifndef DBUG_OFF - char buf1[22],buf2[22]; -#endif - DBUG_ENTER("harvest_bytes_written"); - (*counter)+=bytes_written; - DBUG_PRINT("info",("counter: %s bytes_written: %s", llstr(*counter,buf1), - llstr(bytes_written,buf2))); - bytes_written=0; - DBUG_VOID_RETURN; - } - void set_max_size(ulong max_size_arg); - void signal_update(); - void wait_for_update(THD* thd, bool master_or_slave); - void set_need_start_event() { need_start_event = 1; } - void init(enum_log_type log_type_arg, - enum cache_type io_cache_type_arg, - bool no_auto_events_arg, ulong max_size); - void init_pthread_objects(); - void cleanup(); - bool open(const char *log_name, - enum_log_type log_type, - const char *new_name, - enum cache_type io_cache_type_arg, - bool no_auto_events_arg, ulong max_size, - bool null_created); - const char *generate_name(const char *log_name, const char *suffix, - bool strip_ext, char *buff); - /* simplified open_xxx wrappers for the gigantic open above */ - bool open_query_log(const char *log_name) - { - char buf[FN_REFLEN]; - return open(generate_name(log_name, ".log", 0, buf), - LOG_NORMAL, 0, WRITE_CACHE, 0, 0, 0); - } - bool open_slow_log(const char *log_name) - { - char buf[FN_REFLEN]; - return open(generate_name(log_name, "-slow.log", 0, buf), - LOG_NORMAL, 0, WRITE_CACHE, 0, 0, 0); - } - bool open_index_file(const char *index_file_name_arg, - const char *log_name); - void new_file(bool need_lock); - bool write(THD *thd, enum enum_server_command command, - const char *format,...); - bool write(THD *thd, const char *query, uint query_length, - time_t query_start=0); - bool write(Log_event* event_info); // binary log write - bool write(THD *thd, IO_CACHE *cache, Log_event *commit_event); - - void start_union_events(THD *thd); - void stop_union_events(THD *thd); - bool is_query_in_union(THD *thd, query_id_t query_id_param); - - /* - v stands for vector - invoked as appendv(buf1,len1,buf2,len2,...,bufn,lenn,0) - */ - bool appendv(const char* buf,uint len,...); - bool append(Log_event* ev); - - int generate_new_name(char *new_name,const char *old_name); - void make_log_name(char* buf, const char* log_ident); - bool is_active(const char* log_file_name); - int update_log_index(LOG_INFO* linfo, bool need_update_threads); - void rotate_and_purge(uint flags); - bool flush_and_sync(); - int purge_logs(const char *to_log, bool included, - bool need_mutex, bool need_update_threads, - ulonglong *decrease_log_space); - int purge_logs_before_date(time_t purge_time); - int purge_first_log(struct st_relay_log_info* rli, bool included); - bool reset_logs(THD* thd); - void 
close(uint exiting); - - // iterating through the log index file - int find_log_pos(LOG_INFO* linfo, const char* log_name, - bool need_mutex); - int find_next_log(LOG_INFO* linfo, bool need_mutex); - int get_current_log(LOG_INFO* linfo); - uint next_file_id(); - inline bool is_open() { return log_type != LOG_CLOSED; } - inline char* get_index_fname() { return index_file_name;} - inline char* get_log_fname() { return log_file_name; } - inline char* get_name() { return name; } - inline pthread_mutex_t* get_log_lock() { return &LOCK_log; } - inline IO_CACHE* get_log_file() { return &log_file; } - - inline void lock_index() { pthread_mutex_lock(&LOCK_index);} - inline void unlock_index() { pthread_mutex_unlock(&LOCK_index);} - inline IO_CACHE *get_index_file() { return &index_file;} - inline uint32 get_open_count() { return open_count; } -}; - - typedef struct st_copy_info { ha_rows records; ha_rows deleted; @@ -461,28 +169,6 @@ public: #include "sql_lex.h" /* Must be here */ -/* Needed to be able to have an I_List of char* strings in mysqld.cc. */ - -class i_string: public ilink -{ -public: - const char* ptr; - i_string():ptr(0) { } - i_string(const char* s) : ptr(s) {} -}; - -/* needed for linked list of two strings for replicate-rewrite-db */ -class i_string_pair: public ilink -{ -public: - const char* key; - const char* val; - i_string_pair():key(0),val(0) { } - i_string_pair(const char* key_arg, const char* val_arg) : - key(key_arg),val(val_arg) {} -}; - - class delayed_insert; class select_result; @@ -1102,6 +788,9 @@ class THD :public Statement, public Open_tables_state { public: + /* Used to execute base64 coded binlog events in MySQL server */ + RELAY_LOG_INFO* rli_fake; + /* Constant for THD::where initialization in the beginning of every query. @@ -1206,12 +895,96 @@ public: /* container for handler's private per-connection data */ void *ha_data[MAX_HA]; + +#ifdef HAVE_ROW_BASED_REPLICATION +#ifndef MYSQL_CLIENT + + /* + Public interface to write rows to the binlog + */ + int binlog_write_row(TABLE* table, bool is_transactional, + MY_BITMAP const* cols, my_size_t colcnt, + const byte *buf); + int binlog_delete_row(TABLE* table, bool is_transactional, + MY_BITMAP const* cols, my_size_t colcnt, + const byte *buf); + int binlog_update_row(TABLE* table, bool is_transactional, + MY_BITMAP const* cols, my_size_t colcnt, + const byte *old_data, const byte *new_data); + + void set_server_id(uint32 sid) { server_id = sid; } + + /* + Member functions to handle pending event for row-level logging. 
+ */ + template <class RowsEventT> Rows_log_event* + binlog_prepare_pending_rows_event(TABLE* table, uint32 serv_id, + MY_BITMAP const* cols, + my_size_t colcnt, + my_size_t needed, + bool is_transactional); + Rows_log_event* binlog_get_pending_rows_event() const; + void binlog_set_pending_rows_event(Rows_log_event* ev); + int binlog_setup_trx_data(); + + my_size_t max_row_length_blob(TABLE* table, const byte *data) const; + my_size_t max_row_length(TABLE* table, const byte *data) const + { + TABLE_SHARE *table_s= table->s; + my_size_t length= table_s->reclength + 2 * table_s->fields; + if (table_s->blob_fields == 0) + return length; + + return (length+max_row_length_blob(table,data)); + } + + my_size_t pack_row(TABLE* table, MY_BITMAP const* cols, byte *row_data, + const byte *data) const; + + int binlog_flush_pending_rows_event(bool stmt_end); + void binlog_delete_pending_rows_event(); + +#endif +#endif /* HAVE_ROW_BASED_REPLICATION */ +#ifndef MYSQL_CLIENT + enum enum_binlog_query_type { + /* + The query can be logged row-based or statement-based + */ + ROW_QUERY_TYPE, + + /* + The query has to be logged statement-based + */ + STMT_QUERY_TYPE, + + /* + The query represents a change to a table in the "mysql" + database and is currently mapped to ROW_QUERY_TYPE. + */ + MYSQL_QUERY_TYPE, + QUERY_TYPE_COUNT + }; + + int binlog_query(enum_binlog_query_type qtype, + char const *query, ulong query_len, + bool is_trans, bool suppress_use); +#endif + +public: + struct st_transactions { SAVEPOINT *savepoints; THD_TRANS all; // Trans since BEGIN WORK THD_TRANS stmt; // Trans for current statement bool on; // see ha_enable_transaction() + XID xid; // transaction identifier + enum xa_states xa_state; // used by external XA only XID_STATE xid_state; +#ifdef HAVE_ROW_BASED_REPLICATION + Rows_log_event *m_pending_rows_event; +#endif + /* Tables changed in transaction (that must be invalidated in query cache). 
List contain only transactional tables, that not invalidated in query @@ -1768,6 +1541,7 @@ class select_create: public select_insert { HA_CREATE_INFO *create_info; MYSQL_LOCK *lock; Field **field; + bool create_table_written; public: select_create (TABLE_LIST *table, HA_CREATE_INFO *create_info_par, @@ -1776,9 +1550,11 @@ public: List<Item> &select_fields,enum_duplicates duplic, bool ignore) :select_insert (NULL, NULL, &select_fields, 0, 0, duplic, ignore), create_table(table), extra_fields(&fields_par),keys(&keys_par), create_info(create_info_par), - lock(0) + lock(0), create_table_written(FALSE) {} int prepare(List<Item> &list, SELECT_LEX_UNIT *u); + + void binlog_show_create_table(); void store_values(List<Item> &values); void send_error(uint errcode,const char *err); bool send_eof(); diff --git a/sql/sql_delete.cc b/sql/sql_delete.cc index a9c3504250e..745139924ab 100644 --- a/sql/sql_delete.cc +++ b/sql/sql_delete.cc @@ -40,6 +40,7 @@ bool mysql_delete(THD *thd, TABLE_LIST *table_list, COND *conds, ha_rows deleted; uint usable_index= MAX_KEY; SELECT_LEX *select_lex= &thd->lex->select_lex; + bool ha_delete_row_bypassed= 0; DBUG_ENTER("mysql_delete"); if (open_and_lock_tables(thd, table_list)) @@ -77,15 +78,18 @@ bool mysql_delete(THD *thd, TABLE_LIST *table_list, COND *conds, !(specialflag & (SPECIAL_NO_NEW_FUNC | SPECIAL_SAFE_MODE)) && !(table->triggers && table->triggers->has_delete_triggers())) { - deleted= table->file->records; + ha_rows const maybe_deleted= table->file->records; if (!(error=table->file->delete_all_rows())) { error= -1; // ok + deleted= maybe_deleted; + ha_delete_row_bypassed= 1; goto cleanup; } if (error != HA_ERR_WRONG_COMMAND) { table->file->print_error(error,MYF(0)); + ha_delete_row_bypassed= 1; error=0; goto cleanup; } @@ -211,7 +215,7 @@ bool mysql_delete(THD *thd, TABLE_LIST *table_list, COND *conds, break; } - if (!(error= table->file->delete_row(table->record[0]))) + if (!(error= table->file->ha_delete_row(table->record[0]))) { deleted++; if (table->triggers && @@ -293,10 +297,24 @@ cleanup: { if (error < 0) thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, - transactional_table, FALSE); - if (mysql_bin_log.write(&qinfo) && transactional_table) + + /* + If 'handler::delete_all_rows()' was called, we replicate + statement-based; otherwise, 'ha_delete_row()' was used to + delete specific rows which we might log row-based. + */ + THD::enum_binlog_query_type const + query_type(ha_delete_row_bypassed ? 
+ THD::STMT_QUERY_TYPE : + THD::ROW_QUERY_TYPE); + int log_result= thd->binlog_query(query_type, + thd->query, thd->query_length, + transactional_table, FALSE); + + if (log_result && transactional_table) + { error=1; + } } if (!transactional_table) thd->options|=OPTION_STATUS_NO_TRANS_UPDATE; @@ -592,7 +610,7 @@ bool multi_delete::send_data(List<Item> &values) TRG_ACTION_BEFORE, FALSE)) DBUG_RETURN(1); table->status|= STATUS_DELETED; - if (!(error=table->file->delete_row(table->record[0]))) + if (!(error=table->file->ha_delete_row(table->record[0]))) { deleted++; if (table->triggers && @@ -705,7 +723,7 @@ int multi_delete::do_deletes() local_error= 1; break; } - if ((local_error=table->file->delete_row(table->record[0]))) + if ((local_error=table->file->ha_delete_row(table->record[0]))) { table->file->print_error(local_error,MYF(0)); break; @@ -772,10 +790,13 @@ bool multi_delete::send_eof() { if (local_error == 0) thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, - transactional_tables, FALSE); - if (mysql_bin_log.write(&qinfo) && !normal_tables) + if (thd->binlog_query(THD::ROW_QUERY_TYPE, + thd->query, thd->query_length, + transactional_tables, FALSE) && + !normal_tables) + { local_error=1; // Log write failed: roll back the SQL statement + } } if (!transactional_tables) thd->options|=OPTION_STATUS_NO_TRANS_UPDATE; @@ -880,10 +901,13 @@ end: { if (mysql_bin_log.is_open()) { + /* + TRUNCATE must always be statement-based binlogged (not row-based) so + we don't test binlog_row_based. + */ thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, - 0, FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::STMT_QUERY_TYPE, + thd->query, thd->query_length, FALSE, FALSE); } send_ok(thd); // This should return record count } diff --git a/sql/sql_insert.cc b/sql/sql_insert.cc index 72a2f4a4f91..e053f06df55 100644 --- a/sql/sql_insert.cc +++ b/sql/sql_insert.cc @@ -21,6 +21,7 @@ #include "sp_head.h" #include "sql_trigger.h" #include "sql_select.h" +#include "sql_show.h" static int check_null_fields(THD *thd,TABLE *entry); #ifndef EMBEDDED_LIBRARY @@ -576,10 +577,13 @@ bool mysql_insert(THD *thd,TABLE_LIST *table_list, { if (error <= 0) thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, - transactional_table, FALSE); - if (mysql_bin_log.write(&qinfo) && transactional_table) - error=1; + if (thd->binlog_query(THD::ROW_QUERY_TYPE, + thd->query, thd->query_length, + transactional_table, FALSE) && + transactional_table) + { + error=1; + } } if (!transactional_table) thd->options|=OPTION_STATUS_NO_TRANS_UPDATE; @@ -945,10 +949,11 @@ int write_record(THD *thd, TABLE *table,COPY_INFO *info) DBUG_ENTER("write_record"); info->records++; + if (info->handle_duplicates == DUP_REPLACE || info->handle_duplicates == DUP_UPDATE) { - while ((error=table->file->write_row(table->record[0]))) + while ((error=table->file->ha_write_row(table->record[0]))) { uint key_nr; if (error != HA_WRITE_SKIP) @@ -1032,7 +1037,7 @@ int write_record(THD *thd, TABLE *table,COPY_INFO *info) thd->clear_next_insert_id= 0; thd->next_insert_id= 0; } - if ((error=table->file->update_row(table->record[1],table->record[0]))) + if ((error=table->file->ha_update_row(table->record[1],table->record[0]))) { if ((error == HA_ERR_FOUND_DUPP_KEY) && info->ignore) goto ok_or_after_trg_err; @@ -1071,8 +1076,8 @@ int write_record(THD *thd, TABLE *table,COPY_INFO *info) thd->clear_next_insert_id= 0; thd->next_insert_id= 0; } - if 
((error=table->file->update_row(table->record[1], - table->record[0]))) + if ((error=table->file->ha_update_row(table->record[1], + table->record[0]))) goto err; info->deleted++; trg_error= (table->triggers && @@ -1089,7 +1094,7 @@ int write_record(THD *thd, TABLE *table,COPY_INFO *info) table->triggers->process_triggers(thd, TRG_EVENT_DELETE, TRG_ACTION_BEFORE, TRUE)) goto before_trg_err; - if ((error=table->file->delete_row(table->record[1]))) + if ((error=table->file->ha_delete_row(table->record[1]))) goto err; info->deleted++; if (!table->file->has_transactions()) @@ -1110,7 +1115,7 @@ int write_record(THD *thd, TABLE *table,COPY_INFO *info) table->triggers->process_triggers(thd, TRG_EVENT_INSERT, TRG_ACTION_AFTER, TRUE)); } - else if ((error=table->file->write_row(table->record[0]))) + else if ((error=table->file->ha_write_row(table->record[0]))) { if (!info->ignore || (error != HA_ERR_FOUND_DUPP_KEY && error != HA_ERR_FOUND_DUPP_UNIQUE)) @@ -1196,16 +1201,15 @@ int check_that_all_fields_are_given_values(THD *thd, TABLE *entry, class delayed_row :public ilink { public: - char *record,*query; + char *record; enum_duplicates dup; time_t start_time; bool query_start_used,last_insert_id_used,insert_id_used, ignore, log_query; ulonglong last_insert_id; timestamp_auto_set_type timestamp_field_type; - uint query_length; delayed_row(enum_duplicates dup_arg, bool ignore_arg, bool log_query_arg) - :record(0), query(0), dup(dup_arg), ignore(ignore_arg), log_query(log_query_arg) {} + :record(0), dup(dup_arg), ignore(ignore_arg), log_query(log_query_arg) {} ~delayed_row() { x_free(record); @@ -1215,6 +1219,9 @@ public: class delayed_insert :public ilink { uint locks_in_memory; + char *query; + ulong query_length; + ulong query_allocated; public: THD thd; TABLE *table; @@ -1228,7 +1235,7 @@ public: TABLE_LIST table_list; // Argument delayed_insert() - :locks_in_memory(0), + :locks_in_memory(0), query(0), query_length(0), query_allocated(0), table(0),tables_in_use(0),stacked_inserts(0), status(0), dead(0), group_count(0) { @@ -1254,6 +1261,7 @@ public: } ~delayed_insert() { + my_free(query, MYF(MY_WME|MY_ALLOW_ZERO_PTR)); /* The following is not really needed, but just for safety */ delayed_row *row; while ((row=rows.get())) @@ -1273,6 +1281,25 @@ public: VOID(pthread_cond_broadcast(&COND_thread_count)); /* Tell main we are ready */ } + int set_query(char const *q, ulong qlen) { + if (q && qlen > 0) + { + if (query_allocated < qlen + 1) + { + ulong const flags(MY_WME|MY_FREE_ON_ERROR|MY_ALLOW_ZERO_PTR); + query= my_realloc(query, qlen + 1, MYF(flags)); + if (query == 0) + return HA_ERR_OUT_OF_MEM; + query_allocated= qlen; + } + query_length= qlen; + memcpy(query, q, qlen + 1); + } + else + query_length= 0; + return 0; + } + /* The following is for checking when we can delete ourselves */ inline void lock() { @@ -1562,18 +1589,22 @@ static int write_delayed(THD *thd,TABLE *table,enum_duplicates duplic, if (thd->killed || !(row= new delayed_row(duplic, ignore, log_on))) goto err; +#if 0 if (!query) query_length=0; - if (!(row->record= (char*) my_malloc(table->s->reclength+query_length+1, - MYF(MY_WME)))) +#endif + if (!(row->record= (char*) my_malloc(table->s->reclength, MYF(MY_WME)))) goto err; memcpy(row->record, table->record[0], table->s->reclength); + di->set_query(query, query_length); +#if 0 if (query_length) { row->query= row->record+table->s->reclength; memcpy(row->query,query,query_length+1); } row->query_length= query_length; +#endif row->start_time= thd->start_time; row->query_start_used= 
thd->query_start_used; row->last_insert_id_used= thd->last_insert_id_used; @@ -1897,7 +1928,21 @@ bool delayed_insert::handle_inserts(void) { int error; ulong max_rows; - bool using_ignore=0, using_bin_log=mysql_bin_log.is_open(); + bool using_ignore=0, + using_bin_log= mysql_bin_log.is_open(); + +#if 0 + /* + The actual text for the query is added to the first row in the + list. Since the row is destroyed, with all it's memory, we need + to take a copy of it to be able to log it after all rows have been + applied. + */ + uint const query_length= rows.head()->query_length; + char *const query= static_cast<char*>(my_alloca(query_length+1)); + memcpy(query, rows.head()->query, query_length); +#endif + delayed_row *row; DBUG_ENTER("handle_inserts"); @@ -1963,11 +2008,6 @@ bool delayed_insert::handle_inserts(void) using_ignore=0; table->file->extra(HA_EXTRA_NO_IGNORE_DUP_KEY); } - if (row->query && row->log_query && using_bin_log) - { - Query_log_event qinfo(&thd, row->query, row->query_length, 0, FALSE); - mysql_bin_log.write(&qinfo); - } if (table->s->blob_fields) free_delayed_insert_blobs(table); thread_safe_sub(delayed_rows_in_use,1,&LOCK_delayed_status); @@ -1982,8 +2022,7 @@ bool delayed_insert::handle_inserts(void) on this table until all entries has been processed */ if (group_count++ >= max_rows && (row= rows.head()) && - (!(row->log_query & using_bin_log) || - row->query)) + (!(row->log_query & using_bin_log))) { group_count=0; if (stacked_inserts || tables_in_use) // Let these wait a while @@ -2019,6 +2058,10 @@ bool delayed_insert::handle_inserts(void) thd.proc_info=0; table->next_number_field=0; pthread_mutex_unlock(&mutex); + + /* After releasing the mutex, to prevent deadlocks. */ + thd.binlog_query(THD::ROW_QUERY_TYPE, query, query_length, FALSE, FALSE); + if ((error=table->file->extra(HA_EXTRA_NO_CACHE))) { // This shouldn't happen table->file->print_error(error,MYF(0)); @@ -2216,6 +2259,16 @@ select_insert::prepare(List<Item> &values, SELECT_LEX_UNIT *u) check_that_all_fields_are_given_values(thd, table, table_list)) || table_list->prepare_where(thd, 0, TRUE) || table_list->prepare_check_option(thd)); + + /* + For non-transactional non-temporary tables, we set the + OPTION_STATUS_NO_TRANS_UPDATE flag here. The send_eof() function + is used by both the select_insert and the select_create classes, + so setting it there would clash. + */ + if (!(table->file->has_transactions() || table->s->tmp_table)) + thd->options|=OPTION_STATUS_NO_TRANS_UPDATE; + DBUG_RETURN(res); } @@ -2345,9 +2398,31 @@ void select_insert::send_error(uint errcode,const char *err) table->file->end_bulk_insert(); /* If at least one row has been inserted/modified and will stay in the table - (the table doesn't have transactions) (example: we got a duplicate key - error while inserting into a MyISAM table) we must write to the binlog (and + (the table doesn't have transactions) we must write to the binlog (and the error code will make the slave stop). + + For many errors (example: we got a duplicate key error while + inserting into a MyISAM table), no row will be added to the table, + so passing the error to the slave will not help since there will + be an error code mismatch (the inserts will succeed on the slave + with no error). + + If we are using row-based replication we have two cases where this + code is executed: replication of CREATE-SELECT and replication of + INSERT-SELECT. + + When replicating a CREATE-SELECT statement, we shall not write the + events to the binary log. 
To prevent the ha_rollback_stmt() below + from writing to the binary log, we have to pretend that the table + is transactional, even if it actually is not. Therefore, the + OPTION_STATUS_NO_TRANS_UPDATE is cleared in + select_create::prepare() and will remain cleared here. + + When replicating INSERT-SELECT, we shall not write the events to + the binary log for transactional table, but shall write all events + if there is one or more writes to non-transactional tables. In + this case, the OPTION_STATUS_NO_TRANS_UPDATE is set if there is a + write to a non-transactional table, otherwise it is cleared. */ if ((info.copied || info.deleted || info.updated) && !table->file->has_transactions()) @@ -2356,11 +2431,10 @@ void select_insert::send_error(uint errcode,const char *err) thd->insert_id(last_insert_id); // For binary log if (mysql_bin_log.is_open()) { - Query_log_event qinfo(thd, thd->query, thd->query_length, - table->file->has_transactions(), FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::ROW_QUERY_TYPE, thd->query, thd->query_length, + table->file->has_transactions(), FALSE); } - if (!table->s->tmp_table) + if (!binlog_row_based && !table->s->tmp_table) thd->options|=OPTION_STATUS_NO_TRANS_UPDATE; } if (info.copied || info.deleted || info.updated) @@ -2382,26 +2456,36 @@ bool select_insert::send_eof() /* We must invalidate the table in the query cache before binlog writing - and ha_autocommit_or_rollback - */ + and ha_autocommit_or_rollback. + + If nothing was inserted in the table, there is no need to emit a + ROLLBACK statement to the binary log, so in that case we clear + OPTION_STATUS_NO_TRANS_UPDATE. + Observe that select_insert::send_eof() is used by both + select_insert and select_create and that they set the flag in + different manners. See Note 1 below for more info. + */ if (info.copied || info.deleted || info.updated) - { query_cache_invalidate3(thd, table, 1); - if (!(table->file->has_transactions() || table->s->tmp_table)) - thd->options|=OPTION_STATUS_NO_TRANS_UPDATE; - } + else + thd->options&= ~OPTION_STATUS_NO_TRANS_UPDATE; if (last_insert_id) thd->insert_id(last_insert_id); // For binary log - /* Write to binlog before commiting transaction */ + /* + Write to binlog before commiting transaction. No statement will + be written by the binlog_query() below in RBR mode. All the + events are in the transaction cache and will be written when + ha_autocommit_or_rollback() is issued below. + */ if (mysql_bin_log.is_open()) { if (!error) thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, - table->file->has_transactions(), FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::ROW_QUERY_TYPE, + thd->query, thd->query_length, + table->file->has_transactions(), FALSE); } if ((error2=ha_autocommit_or_rollback(thd,error)) && ! error) error=error2; @@ -2467,8 +2551,62 @@ select_create::prepare(List<Item> &values, SELECT_LEX_UNIT *u) } +void +select_create::binlog_show_create_table() +{ + /* + Note 1: In RBR mode, we generate a CREATE TABLE statement for the + created table by calling store_create_info() (behaves as SHOW + CREATE TABLE). In the event of an error, nothing should be + written to the binary log, even if the table is non-transactional; + therefore we pretend that the generated CREATE TABLE statement is + for a transactional table. The event will then be put in the + transaction cache, and any subsequent events (e.g., table-map + events and binrow events) will also be put there. 
We can then use + ha_autocommit_or_rollback() to either throw away the entire + kaboodle of events, or write them to the binary log. + + We write the CREATE TABLE statement here and not in prepare() + since there potentially are sub-selects or accesses to information + schema that will do a close_thread_tables(), destroying the + statement transaction cache. + + To ensure that the event kaboodle is not written to the binary log + on rollback, we clear the OPTION_STATUS_NO_TRANS_UPDATE bit of + thd->options. + */ + DBUG_ASSERT(binlog_row_based && !create_table_written); + + thd->options&= ~OPTION_STATUS_NO_TRANS_UPDATE; + char buf[2048]; + String query(buf, sizeof(buf), system_charset_info); + query.length(0); // Have to zero it since constructor doesn't + + TABLE_LIST tables; + memset(&tables, 0, sizeof(tables)); + tables.table = table; + + int result= store_create_info(thd, &tables, &query, create_info); + DBUG_ASSERT(result == 0); /* store_create_info() always return 0 */ + thd->binlog_query(THD::STMT_QUERY_TYPE, + query.ptr(), query.length(), + /* is_trans */ TRUE, + /* suppress_use */ FALSE); +} + + void select_create::store_values(List<Item> &values) { + /* + Before writing the first row, we write the CREATE TABLE statement + to the binlog. + */ + if (binlog_row_based && !create_table_written) + { + binlog_show_create_table(); + create_table_written= TRUE; + } + fill_record_n_invoke_before_triggers(thd, field, values, 1, table->triggers, TRG_EVENT_INSERT); } @@ -2488,6 +2626,16 @@ void select_create::send_error(uint errcode,const char *err) bool select_create::send_eof() { + /* + If no rows where written to the binary log, we write the CREATE + TABLE statement to the binlog. + */ + if (binlog_row_based && !create_table_written) + { + binlog_show_create_table(); + create_table_written= TRUE; + } + bool tmp=select_insert::send_eof(); if (tmp) abort(); diff --git a/sql/sql_lex.h b/sql/sql_lex.h index 303245b38bd..00ba075e922 100644 --- a/sql/sql_lex.h +++ b/sql/sql_lex.h @@ -93,7 +93,8 @@ enum enum_sql_command { SQLCOM_XA_COMMIT, SQLCOM_XA_ROLLBACK, SQLCOM_XA_RECOVER, SQLCOM_SHOW_PROC_CODE, SQLCOM_SHOW_FUNC_CODE, SQLCOM_INSTALL_PLUGIN, SQLCOM_UNINSTALL_PLUGIN, - SQLCOM_SHOW_AUTHORS, SQLCOM_SHOW_PLUGINS, + SQLCOM_SHOW_AUTHORS, SQLCOM_BINLOG_BASE64_EVENT, + SQLCOM_SHOW_PLUGINS, /* This should be the last !!! */ SQLCOM_END diff --git a/sql/sql_list.h b/sql/sql_list.h index b2bcc4ea401..05f589a2c23 100644 --- a/sql/sql_list.h +++ b/sql/sql_list.h @@ -441,6 +441,28 @@ struct ilink }; +/* Needed to be able to have an I_List of char* strings in mysqld.cc. */ + +class i_string: public ilink +{ +public: + const char* ptr; + i_string():ptr(0) { } + i_string(const char* s) : ptr(s) {} +}; + +/* needed for linked list of two strings for replicate-rewrite-db */ +class i_string_pair: public ilink +{ +public: + const char* key; + const char* val; + i_string_pair():key(0),val(0) { } + i_string_pair(const char* key_arg, const char* val_arg) : + key(key_arg),val(val_arg) {} +}; + + template <class T> class I_List_iterator; /* diff --git a/sql/sql_load.cc b/sql/sql_load.cc index 09bcb9cb9fe..70abe3e659c 100644 --- a/sql/sql_load.cc +++ b/sql/sql_load.cc @@ -414,38 +414,55 @@ bool mysql_load(THD *thd,sql_exchange *ex,TABLE_LIST *table_list, #ifndef EMBEDDED_LIBRARY if (mysql_bin_log.is_open()) { +#ifdef HAVE_ROW_BASED_REPLICATION /* - Make sure last block (the one which caused the error) gets logged. 
- This is needed because otherwise after write of - (to the binlog, not to read_info (which is a cache)) - Delete_file_log_event the bad block will remain in read_info (because - pre_read is not called at the end of the last block; remember pre_read - is called whenever a new block is read from disk). - At the end of mysql_load(), the destructor of read_info will call - end_io_cache() which will flush read_info, so we will finally have - this in the binlog: - Append_block # The last successfull block - Delete_file - Append_block # The failing block - which is nonsense. - Or could also be (for a small file) - Create_file # The failing block - which is nonsense (Delete_file is not written in this case, because: - Create_file has not been written, so Delete_file is not written, then - when read_info is destroyed end_io_cache() is called which writes - Create_file. + We need to do the job that is normally done inside + binlog_query() here, which is to ensure that the pending event + is written before tables are unlocked and before any other + events are written. We also need to update the table map + version for the binary log to mark that table maps are invalid + after this point. */ - read_info.end_io_cache(); - /* If the file was not empty, wrote_create_file is true */ - if (lf_info.wrote_create_file) + if (binlog_row_based) + thd->binlog_flush_pending_rows_event(true); + else +#endif { - if ((info.copied || info.deleted) && !transactional_table) - write_execute_load_query_log_event(thd, handle_duplicates, - ignore, transactional_table); - else + /* + Make sure last block (the one which caused the error) gets + logged. This is needed because otherwise after write of (to + the binlog, not to read_info (which is a cache)) + Delete_file_log_event the bad block will remain in read_info + (because pre_read is not called at the end of the last + block; remember pre_read is called whenever a new block is + read from disk). At the end of mysql_load(), the destructor + of read_info will call end_io_cache() which will flush + read_info, so we will finally have this in the binlog: + + Append_block # The last successfull block + Delete_file + Append_block # The failing block + which is nonsense. + Or could also be (for a small file) + Create_file # The failing block + which is nonsense (Delete_file is not written in this case, because: + Create_file has not been written, so Delete_file is not written, then + when read_info is destroyed end_io_cache() is called which writes + Create_file. + */ + read_info.end_io_cache(); + /* If the file was not empty, wrote_create_file is true */ + if (lf_info.wrote_create_file) { - Delete_file_log_event d(thd, db, transactional_table); - mysql_bin_log.write(&d); + if ((info.copied || info.deleted) && !transactional_table) + write_execute_load_query_log_event(thd, handle_duplicates, + ignore, transactional_table); + else + { + Delete_file_log_event d(thd, db, transactional_table); + d.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F; + mysql_bin_log.write(&d); + } } } } @@ -462,15 +479,32 @@ bool mysql_load(THD *thd,sql_exchange *ex,TABLE_LIST *table_list, #ifndef EMBEDDED_LIBRARY if (mysql_bin_log.is_open()) { +#ifdef HAVE_ROW_BASED_REPLICATION /* - As already explained above, we need to call end_io_cache() or the last - block will be logged only after Execute_load_query_log_event (which is - wrong), when read_info is destroyed. 
- */ - read_info.end_io_cache(); - if (lf_info.wrote_create_file) - write_execute_load_query_log_event(thd, handle_duplicates, - ignore, transactional_table); + We need to do the job that is normally done inside + binlog_query() here, which is to ensure that the pending event + is written before tables are unlocked and before any other + events are written. We also need to update the table map + version for the binary log to mark that table maps are invalid + after this point. + */ + if (binlog_row_based) + thd->binlog_flush_pending_rows_event(true); + else +#endif + { + /* + As already explained above, we need to call end_io_cache() or the last + block will be logged only after Execute_load_query_log_event (which is + wrong), when read_info is destroyed. + */ + read_info.end_io_cache(); + if (lf_info.wrote_create_file) + { + write_execute_load_query_log_event(thd, handle_duplicates, + ignore, transactional_table); + } + } } #endif /*!EMBEDDED_LIBRARY*/ if (transactional_table) @@ -499,6 +533,7 @@ static bool write_execute_load_query_log_event(THD *thd, (duplicates == DUP_REPLACE) ? LOAD_DUP_REPLACE : (ignore ? LOAD_DUP_IGNORE : LOAD_DUP_ERROR), transactional_table, FALSE); + e.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F; return mysql_bin_log.write(&e); } @@ -910,7 +945,7 @@ READ_INFO::READ_INFO(File file_par, uint tot_length, CHARSET_INFO *cs, if (get_it_from_net) cache.read_function = _my_b_net_read; - if (mysql_bin_log.is_open()) + if (!binlog_row_based && mysql_bin_log.is_open()) cache.pre_read = cache.pre_close = (IO_CACHE_CALLBACK) log_loaded_block; #endif diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc index cf098f1b414..839e1dbd65f 100644 --- a/sql/sql_parse.cc +++ b/sql/sql_parse.cc @@ -3712,8 +3712,8 @@ end_with_restore_list: { if (mysql_bin_log.is_open()) { - Query_log_event qinfo(thd, thd->query, thd->query_length, 0, FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::MYSQL_QUERY_TYPE, + thd->query, thd->query_length, FALSE, FALSE); } send_ok(thd); } @@ -3730,8 +3730,8 @@ end_with_restore_list: { if (mysql_bin_log.is_open()) { - Query_log_event qinfo(thd, thd->query, thd->query_length, 0, FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::MYSQL_QUERY_TYPE, + thd->query, thd->query_length, FALSE, FALSE); } send_ok(thd); } @@ -3748,8 +3748,8 @@ end_with_restore_list: { if (mysql_bin_log.is_open()) { - Query_log_event qinfo(thd, thd->query, thd->query_length, 0, FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::MYSQL_QUERY_TYPE, + thd->query, thd->query_length, FALSE, FALSE); } send_ok(thd); } @@ -3764,8 +3764,8 @@ end_with_restore_list: { if (mysql_bin_log.is_open()) { - Query_log_event qinfo(thd, thd->query, thd->query_length, 0, FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::MYSQL_QUERY_TYPE, + thd->query, thd->query_length, FALSE, FALSE); } send_ok(thd); } @@ -3844,8 +3844,8 @@ end_with_restore_list: if (!res && mysql_bin_log.is_open()) { thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, 0, FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::MYSQL_QUERY_TYPE, + thd->query, thd->query_length, FALSE, FALSE); } } else @@ -3864,8 +3864,8 @@ end_with_restore_list: if (mysql_bin_log.is_open()) { thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, 0, FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::MYSQL_QUERY_TYPE, + thd->query, thd->query_length, FALSE, FALSE); } if (lex->sql_command == SQLCOM_GRANT) { @@ -4161,12 +4161,12 @@ 
end_with_restore_list: db, name, lex->sql_command == SQLCOM_CREATE_PROCEDURE, 1)) { - close_thread_tables(thd); if (sp_grant_privileges(thd, db, name, lex->sql_command == SQLCOM_CREATE_PROCEDURE)) push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN, ER_PROC_AUTO_GRANT_FAIL, ER(ER_PROC_AUTO_GRANT_FAIL)); + close_thread_tables(thd); } #endif send_ok(thd); @@ -4394,8 +4394,8 @@ end_with_restore_list: if (mysql_bin_log.is_open()) { thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, 0, FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::MYSQL_QUERY_TYPE, + thd->query, thd->query_length, FALSE, FALSE); } send_ok(thd); break; @@ -4483,8 +4483,8 @@ end_with_restore_list: if (mysql_bin_log.is_open()) { thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, 0, FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::MYSQL_QUERY_TYPE, + thd->query, thd->query_length, FALSE, FALSE); } send_ok(thd); break; @@ -4608,8 +4608,8 @@ end_with_restore_list: buff.append(STRING_WITH_LEN(" AS ")); buff.append(first_table->source.str, first_table->source.length); - Query_log_event qinfo(thd, buff.ptr(), buff.length(), 0, FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::STMT_QUERY_TYPE, + buff.ptr(), buff.length(), FALSE, FALSE); } break; } @@ -4622,8 +4622,8 @@ end_with_restore_list: mysql_bin_log.is_open()) { thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, 0, FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::STMT_QUERY_TYPE, + thd->query, thd->query_length, FALSE, FALSE); } break; } @@ -4826,15 +4826,24 @@ end_with_restore_list: if (! (res= mysql_uninstall_plugin(thd, &thd->lex->comment))) send_ok(thd); break; + case SQLCOM_BINLOG_BASE64_EVENT: + { +#ifndef EMBEDDED_LIBRARY + mysql_client_binlog_statement(thd); +#else /* EMBEDDED_LIBRARY */ + my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "embedded"); +#endif /* EMBEDDED_LIBRARY */ + break; + } default: DBUG_ASSERT(0); /* Impossible */ send_ok(thd); break; } thd->proc_info="query end"; - /* Two binlog-related cleanups: */ /* + Binlog-related cleanup: Reset system variables temporarily modified by SET ONE SHOT. Exception: If this is a SET, do nothing. This is to allow @@ -5571,7 +5580,6 @@ void mysql_init_multi_delete(LEX *lex) lex->query_tables_last= &lex->query_tables; } - /* When you modify mysql_parse(), you may need to mofify mysql_test_parse_for_slave() in this same file. 
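
The sql_parse.cc hunks above replace each direct Query_log_event write with a call to THD::binlog_query(), so the decision between statement-based and row-based logging is taken in one place. The following stand-alone toy program is an editorial illustration only, not part of the patch: the enum mirrors the one added to sql_class.h, while the dispatch logic is simplified and the binlog_row_based flag is passed in explicitly.

/*
  Stand-alone illustration (not part of the patch) of the qtype dispatch
  performed by THD::binlog_query(). The enum mirrors sql_class.h; the
  logic is simplified and binlog_row_based is passed in explicitly.
*/
#include <cstdio>

enum enum_binlog_query_type
{
  ROW_QUERY_TYPE,     /* may be logged row-based or statement-based */
  STMT_QUERY_TYPE,    /* must be logged statement-based */
  MYSQL_QUERY_TYPE,   /* change to a table in the "mysql" database */
  QUERY_TYPE_COUNT
};

static const char *chosen_logging(enum_binlog_query_type qtype,
                                  bool binlog_row_based)
{
  switch (qtype)
  {
  case MYSQL_QUERY_TYPE:
    /* Per the sql_class.h comment, currently mapped to ROW_QUERY_TYPE. */
  case ROW_QUERY_TYPE:
    return binlog_row_based ? "row events (table map + row events)"
                            : "Query_log_event (statement)";
  case STMT_QUERY_TYPE:
    return "Query_log_event (statement)";
  default:
    return "invalid query type";
  }
}

int main()
{
  std::printf("DELETE, row-based on : %s\n", chosen_logging(ROW_QUERY_TYPE, true));
  std::printf("DELETE, row-based off: %s\n", chosen_logging(ROW_QUERY_TYPE, false));
  std::printf("GRANT,  row-based on : %s\n", chosen_logging(MYSQL_QUERY_TYPE, true));
  return 0;
}
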
diff --git a/sql/sql_plugin.cc b/sql/sql_plugin.cc index 591289f6ee1..82bd6b2c499 100644 --- a/sql/sql_plugin.cc +++ b/sql/sql_plugin.cc @@ -625,7 +625,7 @@ my_bool mysql_install_plugin(THD *thd, LEX_STRING *name, LEX_STRING *dl) restore_record(table, s->default_values); table->field[0]->store(name->str, name->length, system_charset_info); table->field[1]->store(dl->str, dl->length, files_charset_info); - error= table->file->write_row(table->record[0]); + error= table->file->ha_write_row(table->record[0]); if (error) { table->file->print_error(error, MYF(0)); @@ -694,7 +694,7 @@ my_bool mysql_uninstall_plugin(THD *thd, LEX_STRING *name) HA_READ_KEY_EXACT)) { int error; - if ((error= table->file->delete_row(table->record[0]))) + if ((error= table->file->ha_delete_row(table->record[0]))) { table->file->print_error(error, MYF(0)); goto err; diff --git a/sql/sql_rename.cc b/sql/sql_rename.cc index 2c8c732fe86..a1bbb69bc17 100644 --- a/sql/sql_rename.cc +++ b/sql/sql_rename.cc @@ -84,8 +84,8 @@ bool mysql_rename_tables(THD *thd, TABLE_LIST *table_list) if (mysql_bin_log.is_open()) { thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, 0, FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::STMT_QUERY_TYPE, + thd->query, thd->query_length, FALSE, FALSE); } send_ok(thd); } diff --git a/sql/sql_repl.h b/sql/sql_repl.h index ba64e626adc..789de64da85 100644 --- a/sql/sql_repl.h +++ b/sql/sql_repl.h @@ -14,6 +14,8 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include "rpl_filter.h" + #ifdef HAVE_REPLICATION #include "slave.h" diff --git a/sql/sql_select.cc b/sql/sql_select.cc index 7b12069b8ec..f2833f94400 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -9288,11 +9288,11 @@ bool create_myisam_from_heap(THD *thd, TABLE *table, TMP_TABLE_PARAM *param, */ while (!table->file->rnd_next(new_table.record[1])) { - if ((write_err=new_table.file->write_row(new_table.record[1]))) + if ((write_err=new_table.file->ha_write_row(new_table.record[1]))) goto err; } /* copy row that filled HEAP table */ - if ((write_err=new_table.file->write_row(table->record[0]))) + if ((write_err=new_table.file->ha_write_row(table->record[0]))) { if (write_err != HA_ERR_FOUND_DUPP_KEY && write_err != HA_ERR_FOUND_DUPP_UNIQUE || !ignore_last_dupp_key_error) @@ -10691,7 +10691,7 @@ end_write(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)), { int error; join->found_records++; - if ((error=table->file->write_row(table->record[0]))) + if ((error=table->file->ha_write_row(table->record[0]))) { if (error == HA_ERR_FOUND_DUPP_KEY || error == HA_ERR_FOUND_DUPP_UNIQUE) @@ -10753,8 +10753,8 @@ end_update(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)), { /* Update old record */ restore_record(table,record[1]); update_tmptable_sum_func(join->sum_funcs,table); - if ((error=table->file->update_row(table->record[1], - table->record[0]))) + if ((error=table->file->ha_update_row(table->record[1], + table->record[0]))) { table->file->print_error(error,MYF(0)); /* purecov: inspected */ DBUG_RETURN(NESTED_LOOP_ERROR); /* purecov: inspected */ @@ -10777,7 +10777,7 @@ end_update(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)), } init_tmptable_sum_functions(join->sum_funcs); copy_funcs(join->tmp_table_param.items_to_copy); - if ((error=table->file->write_row(table->record[0]))) + if ((error=table->file->ha_write_row(table->record[0]))) { if (create_myisam_from_heap(join->thd, table, 
&join->tmp_table_param, error, 0)) @@ -10813,7 +10813,7 @@ end_unique_update(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)), copy_fields(&join->tmp_table_param); // Groups are copied twice. copy_funcs(join->tmp_table_param.items_to_copy); - if (!(error=table->file->write_row(table->record[0]))) + if (!(error=table->file->ha_write_row(table->record[0]))) join->send_records++; // New group else { @@ -10829,8 +10829,8 @@ end_unique_update(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)), } restore_record(table,record[1]); update_tmptable_sum_func(join->sum_funcs,table); - if ((error=table->file->update_row(table->record[1], - table->record[0]))) + if ((error=table->file->ha_update_row(table->record[1], + table->record[0]))) { table->file->print_error(error,MYF(0)); /* purecov: inspected */ DBUG_RETURN(NESTED_LOOP_ERROR); /* purecov: inspected */ @@ -10873,7 +10873,7 @@ end_write_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)), join->sum_funcs_end[send_group_parts]); if (!join->having || join->having->val_int()) { - int error= table->file->write_row(table->record[0]); + int error= table->file->ha_write_row(table->record[0]); if (error && create_myisam_from_heap(join->thd, table, &join->tmp_table_param, error, 0)) @@ -11735,7 +11735,7 @@ static int remove_dup_with_compare(THD *thd, TABLE *table, Field **first_field, } if (having && !having->val_int()) { - if ((error=file->delete_row(record))) + if ((error=file->ha_delete_row(record))) goto err; error=file->rnd_next(record); continue; @@ -11762,7 +11762,7 @@ static int remove_dup_with_compare(THD *thd, TABLE *table, Field **first_field, } if (compare_record(table, first_field) == 0) { - if ((error=file->delete_row(record))) + if ((error=file->ha_delete_row(record))) goto err; } else if (!found) @@ -11859,7 +11859,7 @@ static int remove_dup_with_hash_index(THD *thd, TABLE *table, } if (having && !having->val_int()) { - if ((error=file->delete_row(record))) + if ((error=file->ha_delete_row(record))) goto err; continue; } @@ -11876,7 +11876,7 @@ static int remove_dup_with_hash_index(THD *thd, TABLE *table, if (hash_search(&hash, org_key_pos, key_length)) { /* Duplicated found ; Remove the row */ - if ((error=file->delete_row(record))) + if ((error=file->ha_delete_row(record))) goto err; } else @@ -13702,7 +13702,7 @@ int JOIN::rollup_write_data(uint idx, TABLE *table) item->save_in_result_field(1); } copy_sum_funcs(sum_funcs_end[i+1], sum_funcs_end[i]); - if ((error= table->file->write_row(table->record[0]))) + if ((error= table->file->ha_write_row(table->record[0]))) { if (create_myisam_from_heap(thd, table, &tmp_table_param, error, 0)) diff --git a/sql/sql_show.cc b/sql/sql_show.cc index d3649aa5f6b..42345afbd0d 100644 --- a/sql/sql_show.cc +++ b/sql/sql_show.cc @@ -19,6 +19,7 @@ #include "mysql_priv.h" #include "sql_select.h" // For select_describe +#include "sql_show.h" #include "repl_failsafe.h" #include "sp.h" #include "sp_head.h" @@ -37,10 +38,6 @@ static TYPELIB grant_types = { sizeof(grant_names)/sizeof(char **), grant_names, NULL}; #endif -static int -store_create_info(THD *thd, TABLE_LIST *table_list, String *packet); -static int -view_store_create_info(THD *thd, TABLE_LIST *table, String *buff); static bool schema_table_store_record(THD *thd, TABLE *table); @@ -540,7 +537,7 @@ mysqld_show_create(THD *thd, TABLE_LIST *table_list) buffer.length(0); if ((table_list->view ? 
view_store_create_info(thd, table_list, &buffer) : - store_create_info(thd, table_list, &buffer))) + store_create_info(thd, table_list, &buffer, NULL))) DBUG_RETURN(TRUE); List<Item> field_list; @@ -719,7 +716,7 @@ mysqld_dump_create_info(THD *thd, TABLE_LIST *table_list, int fd) DBUG_PRINT("enter",("table: %s",table_list->table->s->table_name.str)); protocol->prepare_for_resend(); - if (store_create_info(thd, table_list, packet)) + if (store_create_info(thd, table_list, packet, NULL)) DBUG_RETURN(-1); if (fd < 0) @@ -872,8 +869,31 @@ static void append_directory(THD *thd, String *packet, const char *dir_type, #define LIST_PROCESS_HOST_LEN 64 -static int -store_create_info(THD *thd, TABLE_LIST *table_list, String *packet) +/* + Build a CREATE TABLE statement for a table. + + SYNOPSIS + store_create_info() + thd The thread + table_list A list containing one table to write statement + for. + packet Pointer to a string where statement will be + written. + create_info_arg Pointer to create information that can be used + to tailor the format of the statement. Can be + NULL, in which case only SQL_MODE is considered + when building the statement. + + NOTE + Currently always return 0, but might return error code in the + future. + + RETURN + 0 OK + */ +int +store_create_info(THD *thd, TABLE_LIST *table_list, String *packet, + HA_CREATE_INFO *create_info_arg) { List<Item> field_list; char tmp[MAX_FIELD_WIDTH], *for_str, buff[128], *end; @@ -1108,10 +1128,17 @@ store_create_info(THD *thd, TABLE_LIST *table_list, String *packet) packet->append(STRING_WITH_LEN("\n)")); if (!(thd->variables.sql_mode & MODE_NO_TABLE_OPTIONS) && !foreign_db_mode) { - if (thd->variables.sql_mode & (MODE_MYSQL323 | MODE_MYSQL40)) - packet->append(STRING_WITH_LEN(" TYPE=")); - else - packet->append(STRING_WITH_LEN(" ENGINE=")); + /* + IF check_create_info + THEN add ENGINE only if it was used when creating the table + */ + if (!create_info_arg || + (create_info_arg->used_fields & HA_CREATE_USED_ENGINE)) + { + if (thd->variables.sql_mode & (MODE_MYSQL323 | MODE_MYSQL40)) + packet->append(STRING_WITH_LEN(" TYPE=")); + else + packet->append(STRING_WITH_LEN(" ENGINE=")); #ifdef WITH_PARTITION_STORAGE_ENGINE if (table->part_info) packet->append(ha_resolve_storage_engine_name( @@ -1119,19 +1146,28 @@ store_create_info(THD *thd, TABLE_LIST *table_list, String *packet) else packet->append(file->table_type()); #else - packet->append(file->table_type()); + packet->append(file->table_type()); #endif + } if (share->table_charset && !(thd->variables.sql_mode & MODE_MYSQL323) && !(thd->variables.sql_mode & MODE_MYSQL40)) { - packet->append(STRING_WITH_LEN(" DEFAULT CHARSET=")); - packet->append(share->table_charset->csname); - if (!(share->table_charset->state & MY_CS_PRIMARY)) + /* + IF check_create_info + THEN add DEFAULT CHARSET only if it was used when creating the table + */ + if (!create_info_arg || + (create_info_arg->used_fields & HA_CREATE_USED_DEFAULT_CHARSET)) { - packet->append(STRING_WITH_LEN(" COLLATE=")); - packet->append(table->s->table_charset->name); + packet->append(STRING_WITH_LEN(" DEFAULT CHARSET=")); + packet->append(share->table_charset->csname); + if (!(share->table_charset->state & MY_CS_PRIMARY)) + { + packet->append(STRING_WITH_LEN(" COLLATE=")); + packet->append(table->s->table_charset->name); + } } } @@ -1236,7 +1272,6 @@ view_store_options(THD *thd, TABLE_LIST *table, String *buff) buff->append(STRING_WITH_LEN("SQL SECURITY INVOKER ")); } - /* Append DEFINER clause to the given buffer. 
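
The new create_info_arg parameter to store_create_info() lets callers such as select_create::binlog_show_create_table() and mysql_create_like_table() suppress table options that were never explicitly given: ENGINE= and DEFAULT CHARSET= are appended only when the matching HA_CREATE_USED_* bit is set, or when no create info is passed at all (the plain SHOW CREATE TABLE path, which passes NULL). Below is a stand-alone model of that guard, with invented flag values and simplified stand-in types; it is not the server's code.

/*
  Stand-alone model (not MySQL code) of the used_fields guard that
  store_create_info() now applies before appending table options.
  The flag values below are invented for the example.
*/
#include <cstdio>
#include <string>

static const unsigned HA_CREATE_USED_ENGINE=          1U << 0;  /* invented value */
static const unsigned HA_CREATE_USED_DEFAULT_CHARSET= 1U << 1;  /* invented value */

struct create_info_stub { unsigned used_fields; };

/*
  Append a clause only if no create info was supplied (the SHOW CREATE
  TABLE path) or the corresponding option was used at CREATE TABLE time.
*/
static void append_option(std::string *packet, const create_info_stub *ci,
                          unsigned flag, const char *clause)
{
  if (!ci || (ci->used_fields & flag))
    packet->append(clause);
}

int main()
{
  std::string stmt("CREATE TABLE t (a INT)");
  create_info_stub ci= { HA_CREATE_USED_ENGINE };  /* ENGINE given, charset defaulted */

  append_option(&stmt, &ci, HA_CREATE_USED_ENGINE, " ENGINE=MyISAM");
  append_option(&stmt, &ci, HA_CREATE_USED_DEFAULT_CHARSET,
                " DEFAULT CHARSET=latin1");

  std::printf("%s\n", stmt.c_str());   /* ENGINE kept, DEFAULT CHARSET omitted */
  return 0;
}
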
@@ -1259,7 +1294,7 @@ void append_definer(THD *thd, String *buffer, const LEX_STRING *definer_user, } -static int +int view_store_create_info(THD *thd, TABLE_LIST *table, String *buff) { my_bool foreign_db_mode= (thd->variables.sql_mode & (MODE_POSTGRESQL | @@ -1924,7 +1959,7 @@ typedef struct st_index_field_values static bool schema_table_store_record(THD *thd, TABLE *table) { int error; - if ((error= table->file->write_row(table->record[0]))) + if ((error= table->file->ha_write_row(table->record[0]))) { if (create_myisam_from_heap(thd, table, table->pos_in_table_list->schema_table_param, diff --git a/sql/sql_show.h b/sql/sql_show.h new file mode 100644 index 00000000000..6fce5e94ca3 --- /dev/null +++ b/sql/sql_show.h @@ -0,0 +1,17 @@ + +#ifndef SQL_SHOW_H +#define SQL_SHOW_H + +/* Forward declarations */ +class String; +class THD; +struct st_ha_create_information; +struct st_table_list; +typedef st_ha_create_information HA_CREATE_INFO; +typedef st_table_list TABLE_LIST; + +int store_create_info(THD *thd, TABLE_LIST *table_list, String *packet, + HA_CREATE_INFO *create_info_arg); +int view_store_create_info(THD *thd, TABLE_LIST *table, String *buff); + +#endif /* SQL_SHOW_H */ diff --git a/sql/sql_table.cc b/sql/sql_table.cc index 0e811d63b36..3ac0d3ae466 100644 --- a/sql/sql_table.cc +++ b/sql/sql_table.cc @@ -22,6 +22,7 @@ #include <my_dir.h> #include "sp_head.h" #include "sql_trigger.h" +#include "sql_show.h" #ifdef __WIN__ #include <io.h> @@ -53,14 +54,15 @@ static bool check_engine(THD *thd, const char *table_name, file */ -static void write_bin_log(THD *thd, bool clear_error) +static void write_bin_log(THD *thd, bool clear_error, + char const* query, ulong query_length) { if (mysql_bin_log.is_open()) { if (clear_error) thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, FALSE, FALSE); - mysql_bin_log.write(&qinfo); + thd->binlog_query(THD::STMT_QUERY_TYPE, + query, query_length, FALSE, FALSE); } } @@ -277,9 +279,19 @@ int mysql_rm_table_part2(THD *thd, TABLE_LIST *tables, bool if_exists, char path[FN_REFLEN], *alias; String wrong_tables; int error; + int non_temp_tables_count= 0; bool some_tables_deleted=0, tmp_table_deleted=0, foreign_key_error=0; + String built_query; DBUG_ENTER("mysql_rm_table_part2"); + if (binlog_row_based && !dont_log_query) + { + built_query.set_charset(system_charset_info); + if (if_exists) + built_query.append("DROP TABLE IF EXISTS "); + else + built_query.append("DROP TABLE "); + } /* If we have the table in the definition cache, we don't have to check the .frm file to find if the table is a normal table (not view) and what @@ -313,6 +325,30 @@ int mysql_rm_table_part2(THD *thd, TABLE_LIST *tables, bool if_exists, continue; // removed temporary table } + /* + If row-based replication is used and the table is not a + temporary table, we add the table name to the drop statement + being built. The string always end in a comma and the comma + will be chopped off before being written to the binary log. + */ + if (binlog_row_based && !dont_log_query) + { + ++non_temp_tables_count; + /* + Don't write the database name if it is the current one (or if + thd->db is NULL). 
+ */ + built_query.append("`"); + if (thd->db == NULL || strcmp(db,thd->db) != 0) + { + built_query.append(db); + built_query.append("`.`"); + } + + built_query.append(table->table_name); + built_query.append("`,"); + } + error=0; table_type= table->db_type; if (!drop_temporary) @@ -401,12 +437,48 @@ int mysql_rm_table_part2(THD *thd, TABLE_LIST *tables, bool if_exists, if (some_tables_deleted || tmp_table_deleted || !error) { query_cache_invalidate3(thd, tables, 0); - if (!dont_log_query && mysql_bin_log.is_open()) + if (!dont_log_query) { - if (!error) - thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, FALSE, FALSE); - mysql_bin_log.write(&qinfo); + if (!binlog_row_based || + non_temp_tables_count > 0 && !tmp_table_deleted) + { + /* + In this case, we are either using statement-based + replication or using row-based replication but have only + deleted one or more non-temporary tables (and no temporary + tables). In this case, we can write the original query into + the binary log. + */ + write_bin_log(thd, !error, thd->query, thd->query_length); + } + else if (binlog_row_based && + non_temp_tables_count > 0 && + tmp_table_deleted) + { + /* + In this case we have deleted both temporary and + non-temporary tables, so: + - since we have deleted a non-temporary table we have to + binlog the statement, but + - since we have deleted a temporary table we cannot binlog + the statement (since the table has not been created on the + slave, this might cause the slave to stop). + + Instead, we write a built statement, only containing the + non-temporary tables, to the binary log + */ + built_query.chop(); // Chop of the last comma + built_query.append(" /* generated by server */"); + write_bin_log(thd, !error, built_query.ptr(), built_query.length()); + } + /* + The remaining cases are: + - no tables where deleted and + - only temporary tables where deleted and row-based + replication is used. + In both these cases, nothing should be written to the binary + log. + */ } } @@ -1837,13 +1909,17 @@ bool mysql_create_table(THD *thd,const char *db, const char *table_name, } thd->tmp_table_used= 1; } - if (!internal_tmp_table && mysql_bin_log.is_open()) - { - thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, FALSE, FALSE); - mysql_bin_log.write(&qinfo); - } + /* + Don't write statement if: + - It is an internal temporary table, + - Row-based logging is used and it we are creating a temporary table, or + - The binary log is not open. + */ + if (!internal_tmp_table && + !(binlog_row_based && + (create_info->options & HA_LEX_CREATE_TMP_TABLE))) + write_bin_log(thd, TRUE, thd->query, thd->query_length); error= FALSE; unlock_and_end: VOID(pthread_mutex_unlock(&LOCK_open)); @@ -2982,8 +3058,63 @@ bool mysql_create_like_table(THD* thd, TABLE_LIST* table, goto err; /* purecov: inspected */ } - // Must be written before unlock - write_bin_log(thd, TRUE); + /* + We have to write the query before we unlock the tables. + */ + if (binlog_row_based) + { + /* + Since temporary tables are not replicated under row-based + replication, CREATE TABLE ... LIKE ... needs special + treatement. 
We have four cases to consider, according to the + following decision table: + + ==== ========= ========= ============================== + Case Target Source Write to binary log + ==== ========= ========= ============================== + 1 normal normal Original statement + 2 normal temporary Generated statement + 3 temporary normal Nothing + 4 temporary temporary Nothing + ==== ========= ========= ============================== + + The variable 'tmp_table' below is used to see if the source + table is a temporary table: if it is set, then the source table + was a temporary table and we can take apropriate actions. + */ + if (!(create_info->options & HA_LEX_CREATE_TMP_TABLE)) + { + if (tmp_table) // Case 2 + { + char buf[2048]; + String query(buf, sizeof(buf), system_charset_info); + query.length(0); // Have to zero it since constructor doesn't + TABLE *table_ptr; + int error; + + /* + Let's open and lock the table: it will be closed (and + unlocked) by close_thread_tables() at the end of the + statement anyway. + */ + if (!(table_ptr= open_ltable(thd, table, TL_READ_NO_INSERT))) + goto err; + + int result= store_create_info(thd, table, &query, create_info); + + DBUG_ASSERT(result == 0); // store_create_info() always return 0 + write_bin_log(thd, TRUE, query.ptr(), query.length()); + } + else // Case 1 + write_bin_log(thd, TRUE, thd->query, thd->query_length); + } + /* + Case 3 and 4 does nothing under RBR + */ + } + else if (!(create_info->options & HA_LEX_CREATE_TMP_TABLE)) + write_bin_log(thd, TRUE, thd->query, thd->query_length); + res= FALSE; goto err; @@ -3089,7 +3220,7 @@ mysql_discard_or_import_tablespace(THD *thd, error=1; if (error) goto err; - write_bin_log(thd, FALSE); + write_bin_log(thd, FALSE, thd->query, thd->query_length); err: close_thread_tables(thd); thd->tablespace_op=FALSE; @@ -4057,7 +4188,7 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name, } if (!error) { - write_bin_log(thd, TRUE); + write_bin_log(thd, TRUE, thd->query, thd->query_length); if (do_send_ok) send_ok(thd); } @@ -4472,7 +4603,7 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name, if (!error) { close_thread_tables(thd); - write_bin_log(thd, FALSE); + write_bin_log(thd, FALSE, thd->query, thd->query_length); send_ok(thd); DBUG_RETURN(FALSE); } @@ -4609,7 +4740,9 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name, my_free((gptr) new_table,MYF(0)); goto err; } - write_bin_log(thd, TRUE); + /* We don't replicate alter table statement on temporary tables */ + if (!binlog_row_based) + write_bin_log(thd, TRUE, thd->query, thd->query_length); goto end_temporary; } @@ -4751,7 +4884,10 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name, goto err; } thd->proc_info="end"; - write_bin_log(thd, TRUE); + + DBUG_ASSERT(!(mysql_bin_log.is_open() && binlog_row_based && + (create_info->options & HA_LEX_CREATE_TMP_TABLE))); + write_bin_log(thd, TRUE, thd->query, thd->query_length); VOID(pthread_cond_broadcast(&COND_refresh)); VOID(pthread_mutex_unlock(&LOCK_open)); /* @@ -4928,7 +5064,7 @@ copy_data_between_tables(TABLE *from,TABLE *to, { copy_ptr->do_copy(copy_ptr); } - if ((error=to->file->write_row((byte*) to->record[0]))) + if ((error=to->file->ha_write_row((byte*) to->record[0]))) { if ((!ignore && handle_duplicates != DUP_REPLACE) || diff --git a/sql/sql_udf.cc b/sql/sql_udf.cc index 367803f4c86..77bfba5ba28 100644 --- a/sql/sql_udf.cc +++ b/sql/sql_udf.cc @@ -455,7 +455,7 @@ int mysql_create_function(THD *thd,udf_func *udf) table->field[2]->store(u_d->dl,(uint) 
diff --git a/sql/sql_udf.cc b/sql/sql_udf.cc
index 367803f4c86..77bfba5ba28 100644
--- a/sql/sql_udf.cc
+++ b/sql/sql_udf.cc
@@ -455,7 +455,7 @@ int mysql_create_function(THD *thd,udf_func *udf)
   table->field[2]->store(u_d->dl,(uint) strlen(u_d->dl), system_charset_info);
   if (table->s->fields >= 4)                  // If not old func format
     table->field[3]->store((longlong) u_d->type, TRUE);
-  error = table->file->write_row(table->record[0]);
+  error = table->file->ha_write_row(table->record[0]);
   close_thread_tables(thd);
   if (error)
@@ -514,7 +514,7 @@ int mysql_drop_function(THD *thd,const LEX_STRING *udf_name)
                                 HA_READ_KEY_EXACT))
   {
     int error;
-    if ((error = table->file->delete_row(table->record[0])))
+    if ((error = table->file->ha_delete_row(table->record[0])))
       table->file->print_error(error, MYF(0));
   }
   close_thread_tables(thd);
diff --git a/sql/sql_union.cc b/sql/sql_union.cc
index dee88af7d83..e80aaecfd64 100644
--- a/sql/sql_union.cc
+++ b/sql/sql_union.cc
@@ -62,7 +62,7 @@ bool select_union::send_data(List<Item> &values)
   if (thd->net.report_error)
     return 1;
-  if ((error= table->file->write_row(table->record[0])))
+  if ((error= table->file->ha_write_row(table->record[0])))
   {
     /* create_myisam_from_heap will generate error if needed */
     if (error != HA_ERR_FOUND_DUPP_KEY && error != HA_ERR_FOUND_DUPP_UNIQUE &&
diff --git a/sql/sql_update.cc b/sql/sql_update.cc
index 3aa6e7fc874..bd001cd9a06 100644
--- a/sql/sql_update.cc
+++ b/sql/sql_update.cc
@@ -355,6 +355,9 @@ int mysql_update(THD *thd,
   /* If quick select is used, initialize it before retrieving rows. */
   if (select && select->quick && select->quick->reset())
     goto err;
+
+  table->file->try_semi_consistent_read(1);
+
   if (used_index == MAX_KEY || (select && select->quick))
     init_read_record(&info,thd,table,select,0,1);
   else
@@ -367,6 +370,9 @@ int mysql_update(THD *thd,
     {
       if (!(select && select->skip_record()))
       {
+        if (table->file->was_semi_consistent_read())
+          continue;  /* repeat the read of the same row if it still exists */
+
         table->file->position(table->record[0]);
         if (my_b_write(&tempfile,table->file->ref, table->file->ref_length))
@@ -386,6 +392,7 @@ int mysql_update(THD *thd,
     if (thd->killed && !error)
       error= 1;                             // Aborted
     limit= tmp_limit;
+    table->file->try_semi_consistent_read(0);
     end_read_record(&info);
     /* Change select to use tempfile */
@@ -420,6 +427,7 @@ int mysql_update(THD *thd,
   if (select && select->quick && select->quick->reset())
     goto err;
+  table->file->try_semi_consistent_read(1);
   init_read_record(&info,thd,table,select,0,1);
   updated= found= 0;
@@ -435,10 +443,14 @@ int mysql_update(THD *thd,
                      (MODE_STRICT_TRANS_TABLES | MODE_STRICT_ALL_TABLES)));
   will_batch= !table->file->start_bulk_update();
+
   while (!(error=info.read_record(&info)) && !thd->killed)
   {
     if (!(select && select->skip_record()))
     {
+      if (table->file->was_semi_consistent_read())
+        continue;  /* repeat the read of the same row if it still exists */
+
       store_record(table,record[1]);
       if (fill_record_n_invoke_before_triggers(thd, fields, values, 0, table->triggers,
@@ -498,8 +510,8 @@ int mysql_update(THD *thd,
       else
       {
         /* Non-batched update */
-        error= table->file->update_row((byte*) table->record[1],
-                                       (byte*) table->record[0]);
+        error= table->file->ha_update_row((byte*) table->record[1],
+                                          (byte*) table->record[0]);
       }
       if (!error)
       {
@@ -594,6 +606,7 @@ int mysql_update(THD *thd,
   updated-= dup_key_found;
   if (will_batch)
     table->file->end_bulk_update();
+  table->file->try_semi_consistent_read(0);
   end_read_record(&info);
   free_io_cache(table);                         // If ORDER BY
   delete select;
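The try_semi_consistent_read()/was_semi_consistent_read() calls added to both read loops above follow one pattern: enable semi-consistent reads for the duration of the scan, and skip the update for any row whose last read was not locked so that the read of that row can be repeated if it still exists. The standalone sketch below shows only that loop shape; the Handler struct and its methods are invented stand-ins for the storage engine handler interface, not the real class.

// Standalone sketch (not server code) of the scan shape used in
// mysql_update() above with semi-consistent reads enabled.
#include <cstdio>

struct Handler
{
  int  rows_left= 3;
  bool pending_semi_consistent= true;   // pretend the first row came back
                                        // from an unlocked (stale) read
  void try_semi_consistent_read(bool on)
  { std::printf("semi-consistent reads %s\n", on ? "on" : "off"); }

  bool was_semi_consistent_read()       // true: last row was read without a
  {                                     // lock and must not be updated yet
    bool flag= pending_semi_consistent;
    pending_semi_consistent= false;
    return flag;
  }
  bool read_next_row() { return rows_left-- > 0; }
  void update_row()    { std::printf("update row\n"); }
};

int main()
{
  Handler h;
  h.try_semi_consistent_read(true);     // enable before the scan
  while (h.read_next_row())
  {
    if (h.was_semi_consistent_read())   // skip now; the row is re-read
      continue;                         // (with a lock) if it still exists
    h.update_row();
  }
  h.try_semi_consistent_read(false);    // restore normal locking reads
  return 0;
}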
@@ -624,10 +637,13 @@ int mysql_update(THD *thd,
   {
     if (error < 0)
       thd->clear_error();
-    Query_log_event qinfo(thd, thd->query, thd->query_length,
-                          transactional_table, FALSE);
-    if (mysql_bin_log.write(&qinfo) && transactional_table)
+    if (thd->binlog_query(THD::ROW_QUERY_TYPE,
+                          thd->query, thd->query_length,
+                          transactional_table, FALSE) &&
+        transactional_table)
+    {
       error=1;                              // Rollback update
+    }
   }
   if (!transactional_table)
     thd->options|=OPTION_STATUS_NO_TRANS_UPDATE;
@@ -1364,8 +1380,8 @@ bool multi_update::send_data(List<Item> &not_used_values)
       */
       main_table->file->extra(HA_EXTRA_PREPARE_FOR_UPDATE);
     }
-    if ((error=table->file->update_row(table->record[1],
-                                       table->record[0])))
+    if ((error=table->file->ha_update_row(table->record[1],
+                                          table->record[0])))
     {
       updated--;
       if (!ignore || error != HA_ERR_FOUND_DUPP_KEY)
@@ -1400,7 +1416,7 @@ bool multi_update::send_data(List<Item> &not_used_values)
     memcpy((char*) tmp_table->field[0]->ptr, (char*) table->file->ref, table->file->ref_length);
     /* Write row, ignoring duplicated updates to a row */
-    if (error= tmp_table->file->write_row(tmp_table->record[0]))
+    if (error= tmp_table->file->ha_write_row(tmp_table->record[0]))
     {
       if (error != HA_ERR_FOUND_DUPP_KEY && error != HA_ERR_FOUND_DUPP_UNIQUE &&
@@ -1517,8 +1533,8 @@ int multi_update::do_updates(bool from_send_error)
       if (compare_record(table, thd->query_id))
       {
-        if ((local_error=table->file->update_row(table->record[1],
-                                                 table->record[0])))
+        if ((local_error=table->file->ha_update_row(table->record[1],
+                                                    table->record[0])))
         {
           if (!ignore || local_error != HA_ERR_FOUND_DUPP_KEY)
             goto err;
@@ -1597,10 +1613,13 @@ bool multi_update::send_eof()
   {
     if (local_error == 0)
       thd->clear_error();
-    Query_log_event qinfo(thd, thd->query, thd->query_length,
-                          transactional_tables, FALSE);
-    if (mysql_bin_log.write(&qinfo) && trans_safe)
+    if (thd->binlog_query(THD::ROW_QUERY_TYPE,
+                          thd->query, thd->query_length,
+                          transactional_tables, FALSE) &&
+        trans_safe)
+    {
       local_error= 1;                       // Rollback update
+    }
   }
   if (!transactional_tables)
     thd->options|=OPTION_STATUS_NO_TRANS_UPDATE;
diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy
index 4518f9e8de1..bc41178e1be 100644
--- a/sql/sql_yacc.yy
+++ b/sql/sql_yacc.yy
@@ -897,6 +897,7 @@ statement:
          alter
        | analyze
        | backup
+       | binlog_base64_event
        | call
        | change
        | check
@@ -4400,6 +4401,13 @@ analyze:
          {}
        ;
+binlog_base64_event:
+        BINLOG_SYM TEXT_STRING_sys
+        {
+          Lex->sql_command = SQLCOM_BINLOG_BASE64_EVENT;
+          Lex->comment= $2;
+        }
+
check:
          CHECK_SYM table_or_tables
          {
diff --git a/sql/table.cc b/sql/table.cc
index f9c6344e88f..bf208918346 100644
--- a/sql/table.cc
+++ b/sql/table.cc
@@ -1206,6 +1206,7 @@ int open_table_from_share(THD *thd, TABLE_SHARE *share, const char *alias,
   outparam->in_use= thd;
   outparam->s= share;
   outparam->db_stat= db_stat;
+  outparam->write_row_record= NULL;
   init_sql_alloc(&outparam->mem_root, TABLE_ALLOC_BLOCK_SIZE, 0);
   *root_ptr= &outparam->mem_root;
@@ -1396,6 +1397,25 @@ int open_table_from_share(THD *thd, TABLE_SHARE *share, const char *alias,
   *root_ptr= old_root;
   thd->status_var.opened_tables++;
+#ifdef HAVE_REPLICATION
+
+  /*
+    This constant is used to mark that no table map version has been
+    assigned. No arithmetic is done on the value: it will be
+    overwritten with a value taken from MYSQL_BIN_LOG.
+  */
+  share->table_map_version= ~(ulonglong)0;
+
+  /*
+    Since openfrm() can be called without any locking (for example,
+    ha_create_table... functions), we do not assign a table map id
+    here. Instead we assign a value that is not used elsewhere, and
+    then assign a table map id inside open_table() under the
+    protection of the LOCK_open mutex.
+  */
+  share->table_map_id= ULONG_MAX;
+#endif
+
   DBUG_RETURN (0);
err:
diff --git a/sql/table.h b/sql/table.h
index d81eb6afe91..c8b9db1b2a5 100644
--- a/sql/table.h
+++ b/sql/table.h
@@ -189,7 +189,8 @@ typedef struct st_table_share
   bool is_view;
   bool name_lock, replace_with_name_lock;
   bool waiting_on_cond;                 /* Protection against free */
-
+  ulong table_map_id;                   /* for row-based replication */
+  ulonglong table_map_version;
   /*
     TRUE if this is a system table like 'mysql.proc', which we want to be able to open and lock even when we already have some tables open and
@@ -220,6 +221,8 @@ struct st_table {
   Field **field;                        /* Pointer to fields */
   byte *record[2];                      /* Pointer to records */
+  byte *write_row_record;               /* Used as optimisation in
+                                           THD::write_row */
   byte *insert_values;                  /* used by INSERT ... UPDATE */
   key_map quick_keys, used_keys, keys_in_use_for_query;
   KEY  *key_info;                       /* data of keys in database */
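The new table_map_id and table_map_version members above are planted with sentinel values in open_table_from_share() (see the table.cc hunk) and only given real values later, under the protection of LOCK_open. The standalone sketch below shows that sentinel-then-assign idea on its own; Share, assign_table_map_id and the std::mutex are invented stand-ins used for illustration, not server code.

// Standalone sketch (not server code) of the "unassigned until opened
// under the mutex" convention described in the comments above.
#include <climits>
#include <cstdio>
#include <mutex>

struct Share
{
  unsigned long table_map_id;
  unsigned long long table_map_version;
};

static std::mutex lock_open;               // stand-in for LOCK_open
static unsigned long next_table_map_id= 0;

// May run without any particular locking: only plant the sentinels.
static void open_from_share(Share *share)
{
  share->table_map_id= ULONG_MAX;          // "no id assigned yet"
  share->table_map_version= ~0ULL;         // "no version assigned yet"
}

// Runs under the table-cache mutex: hand out the real id exactly once.
static void assign_table_map_id(Share *share)
{
  std::lock_guard<std::mutex> guard(lock_open);
  if (share->table_map_id == ULONG_MAX)
    share->table_map_id= next_table_map_id++;
}

int main()
{
  Share s;
  open_from_share(&s);
  assign_table_map_id(&s);
  std::printf("table map id = %lu\n", s.table_map_id);
  return 0;
}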