diff options
author | Sergei Golubchik <sergii@pisem.net> | 2011-07-18 23:04:24 +0200 |
---|---|---|
committer | Sergei Golubchik <sergii@pisem.net> | 2011-07-18 23:04:24 +0200 |
commit | 4e46d8e5bff140f2549841167dc4b65a3c0a645d (patch) | |
tree | c6612dcc1d0fbd801c084e6c36307d9e5913a293 /storage/xtradb/ibuf/ibuf0ibuf.c | |
parent | 9a02c69f5c6766eaf552284a2a4dd0f1d26ecd2c (diff) | |
parent | d4d7a8fa62c406be73f6c0f6d75e795293db548b (diff) | |
download | mariadb-git-4e46d8e5bff140f2549841167dc4b65a3c0a645d.tar.gz |
merge with xtradb-5.5.15
fix test cases
Diffstat (limited to 'storage/xtradb/ibuf/ibuf0ibuf.c')
-rw-r--r-- | storage/xtradb/ibuf/ibuf0ibuf.c | 2624 |
1 files changed, 1981 insertions, 643 deletions
diff --git a/storage/xtradb/ibuf/ibuf0ibuf.c b/storage/xtradb/ibuf/ibuf0ibuf.c index e01c2d6b800..f5ed301dcf1 100644 --- a/storage/xtradb/ibuf/ibuf0ibuf.c +++ b/storage/xtradb/ibuf/ibuf0ibuf.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -44,17 +44,18 @@ Created 7/19/1997 Heikki Tuuri #include "fsp0fsp.h" #include "trx0sys.h" #include "fil0fil.h" -#include "thr0loc.h" #include "rem0rec.h" #include "btr0cur.h" #include "btr0pcur.h" #include "btr0btr.h" +#include "row0upd.h" #include "sync0sync.h" #include "dict0boot.h" #include "fut0lst.h" #include "lock0lock.h" #include "log0recv.h" #include "que0que.h" +#include "srv0start.h" /* srv_shutdown_state */ /* STRUCTURE OF AN INSERT BUFFER RECORD @@ -89,7 +90,28 @@ is in the compact format. The presence of this marker can be detected by looking at the length of the field modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE. The high-order bit of the character set field in the type info is the -"nullable" flag for the field. */ +"nullable" flag for the field. + +In versions >= 5.5: + +The optional marker byte at the start of the fourth field is replaced by +mandatory 3 fields, totaling 4 bytes: + + 1. 2 bytes: Counter field, used to sort records within a (space id, page + no) in the order they were added. This is needed so that for example the + sequence of operations "INSERT x, DEL MARK x, INSERT x" is handled + correctly. + + 2. 1 byte: Operation type (see ibuf_op_t). + + 3. 1 byte: Flags. Currently only one flag exists, IBUF_REC_COMPACT. + +To ensure older records, which do not have counters to enforce correct +sorting, are merged before any new records, ibuf_insert checks if we're +trying to insert to a position that contains old-style records, and if so, +refuses the insert. Thus, ibuf pages are gradually converted to the new +format as their corresponding buffer pool pages are read into memory. +*/ /* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM @@ -168,7 +190,12 @@ access order rules. */ #define IBUF_TABLE_NAME "SYS_IBUF_TABLE" /** Operations that can currently be buffered. */ -UNIV_INTERN ibuf_use_t ibuf_use = IBUF_USE_INSERT; +UNIV_INTERN ibuf_use_t ibuf_use = IBUF_USE_ALL; + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +/** Flag to control insert buffer debugging. */ +UNIV_INTERN uint ibuf_debug; +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ /** The insert buffer control structure */ UNIV_INTERN ibuf_t* ibuf = NULL; @@ -176,6 +203,12 @@ UNIV_INTERN ibuf_t* ibuf = NULL; /** Counter for ibuf_should_try() */ UNIV_INTERN ulint ibuf_flush_count = 0; +#ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key; +UNIV_INTERN mysql_pfs_key_t ibuf_mutex_key; +UNIV_INTERN mysql_pfs_key_t ibuf_bitmap_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + #ifdef UNIV_IBUF_COUNT_DEBUG /** Number of tablespaces in the ibuf_counts array */ #define IBUF_COUNT_N_SPACES 4 @@ -221,6 +254,32 @@ ibuf_count_check( list of the ibuf */ /* @} */ +/* Various constants for checking the type of an ibuf record and extracting +data from it. For details, see the description of the record format at the +top of this file. */ + +/** @name Format of the fourth column of an insert buffer record +The fourth column in the MySQL 5.5 format contains an operation +type, counter, and some flags. */ +/* @{ */ +#define IBUF_REC_INFO_SIZE 4 /*!< Combined size of info fields at + the beginning of the fourth field */ +#if IBUF_REC_INFO_SIZE >= DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE +# error "IBUF_REC_INFO_SIZE >= DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE" +#endif + +/* Offsets for the fields at the beginning of the fourth field */ +#define IBUF_REC_OFFSET_COUNTER 0 /*!< Operation counter */ +#define IBUF_REC_OFFSET_TYPE 2 /*!< Type of operation */ +#define IBUF_REC_OFFSET_FLAGS 3 /*!< Additional flags */ + +/* Record flag masks */ +#define IBUF_REC_COMPACT 0x1 /*!< Set in + IBUF_REC_OFFSET_FLAGS if the + user index is in COMPACT + format or later */ + + /** The mutex used to block pessimistic inserts to ibuf trees */ static mutex_t ibuf_pessimistic_insert_mutex; @@ -264,52 +323,43 @@ still physically like the index page even if the index would have been dropped! So, there seems to be no problem. */ /******************************************************************//** -Sets the flag in the current OS thread local storage denoting that it is +Sets the flag in the current mini-transaction record indicating we're inside an insert buffer routine. */ UNIV_INLINE void -ibuf_enter(void) -/*============*/ +ibuf_enter( +/*=======*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ { - ibool* ptr; - - ptr = thr_local_get_in_ibuf_field(); - - ut_ad(*ptr == FALSE); - - *ptr = TRUE; + ut_ad(!mtr->inside_ibuf); + mtr->inside_ibuf = TRUE; } /******************************************************************//** -Sets the flag in the current OS thread local storage denoting that it is +Sets the flag in the current mini-transaction record indicating we're exiting an insert buffer routine. */ UNIV_INLINE void -ibuf_exit(void) -/*===========*/ +ibuf_exit( +/*======*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ { - ibool* ptr; - - ptr = thr_local_get_in_ibuf_field(); - - ut_ad(*ptr == TRUE); - - *ptr = FALSE; + ut_ad(mtr->inside_ibuf); + mtr->inside_ibuf = FALSE; } -/******************************************************************//** -Returns TRUE if the current OS thread is performing an insert buffer -routine. - -For instance, a read-ahead of non-ibuf pages is forbidden by threads -that are executing an insert buffer routine. -@return TRUE if inside an insert buffer routine */ -UNIV_INTERN -ibool -ibuf_inside(void) -/*=============*/ +/**************************************************************//** +Commits an insert buffer mini-transaction and sets the persistent +cursor latch mode to BTR_NO_LATCHES, that is, detaches the cursor. */ +UNIV_INLINE +void +ibuf_btr_pcur_commit_specify_mtr( +/*=============================*/ + btr_pcur_t* pcur, /*!< in/out: persistent cursor */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { - return(*thr_local_get_in_ibuf_field()); + ut_d(ibuf_exit(mtr)); + btr_pcur_commit_specify_mtr(pcur, mtr); } /******************************************************************//** @@ -319,11 +369,11 @@ static page_t* ibuf_header_page_get( /*=================*/ - mtr_t* mtr) /*!< in: mtr */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { buf_block_t* block; - ut_ad(!ibuf_inside()); + ut_ad(!ibuf_inside(mtr)); block = buf_page_get( IBUF_SPACE_ID, 0, FSP_IBUF_HEADER_PAGE_NO, RW_X_LATCH, mtr); @@ -342,8 +392,10 @@ ibuf_tree_root_get( mtr_t* mtr) /*!< in: mtr */ { buf_block_t* block; + page_t* root; - ut_ad(ibuf_inside()); + ut_ad(ibuf_inside(mtr)); + ut_ad(mutex_own(&ibuf_mutex)); mtr_x_lock(dict_index_get_lock(ibuf->index), mtr); @@ -352,7 +404,13 @@ ibuf_tree_root_get( buf_block_dbg_add_level(block, SYNC_TREE_NODE); - return(buf_block_get_frame(block)); + root = buf_block_get_frame(block); + + ut_ad(page_get_space_id(root) == IBUF_SPACE_ID); + ut_ad(page_get_page_no(root) == FSP_IBUF_TREE_ROOT_PAGE_NO); + ut_ad(ibuf->empty == (page_get_n_recs(root) == 0)); + + return(root); } #ifdef UNIV_IBUF_COUNT_DEBUG @@ -411,6 +469,45 @@ ibuf_close(void) } /******************************************************************//** +Function to pass ibuf status variables */ +UNIV_INTERN +void +ibuf_export_ibuf_status( +/*====================*/ + ulint* size, + ulint* free_list, + ulint* segment_size, + ulint* merges, + ulint* merged_inserts, + ulint* merged_delete_marks, + ulint* merged_deletes, + ulint* discarded_inserts, + ulint* discarded_delete_marks, + ulint* discarded_deletes) +{ + *size + = ibuf->size; + *free_list + = ibuf->free_list_len; + *segment_size + = ibuf->seg_size; + *merges + = ibuf->n_merges; + *merged_inserts + = ibuf->n_merged_ops[IBUF_OP_INSERT]; + *merged_delete_marks + = ibuf->n_merged_ops[IBUF_OP_DELETE_MARK]; + *merged_deletes + = ibuf->n_merged_ops[IBUF_OP_DELETE]; + *discarded_inserts + = ibuf->n_discarded_ops[IBUF_OP_INSERT]; + *discarded_delete_marks + = ibuf->n_discarded_ops[IBUF_OP_DELETE_MARK]; + *discarded_deletes + = ibuf->n_discarded_ops[IBUF_OP_DELETE]; +} + +/******************************************************************//** Updates the size information of the ibuf, assuming the segment size has not changed. */ static @@ -429,8 +526,6 @@ ibuf_size_update( /* the '1 +' is the ibuf header page */ ibuf->size = ibuf->seg_size - (1 + ibuf->free_list_len); - - ibuf->empty = page_get_n_recs(root) == 0; } /******************************************************************//** @@ -463,12 +558,15 @@ ibuf_init_at_db_start(void) srv_ibuf_max_size = (long long) ibuf->max_size * UNIV_PAGE_SIZE; - mutex_create(&ibuf_pessimistic_insert_mutex, + mutex_create(ibuf_pessimistic_insert_mutex_key, + &ibuf_pessimistic_insert_mutex, SYNC_IBUF_PESS_INSERT_MUTEX); - mutex_create(&ibuf_mutex, SYNC_IBUF_MUTEX); + mutex_create(ibuf_mutex_key, + &ibuf_mutex, SYNC_IBUF_MUTEX); - mutex_create(&ibuf_bitmap_mutex, SYNC_IBUF_BITMAP_MUTEX); + mutex_create(ibuf_bitmap_mutex_key, + &ibuf_bitmap_mutex, SYNC_IBUF_BITMAP_MUTEX); mtr_start(&mtr); @@ -480,7 +578,7 @@ ibuf_init_at_db_start(void) fseg_n_reserved_pages(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, &n_used, &mtr); - ibuf_enter(); + ibuf_enter(&mtr); ut_ad(n_used >= 2); @@ -500,9 +598,8 @@ ibuf_init_at_db_start(void) ibuf_size_update(root, &mtr); mutex_exit(&ibuf_mutex); - mtr_commit(&mtr); - - ibuf_exit(); + ibuf->empty = (page_get_n_recs(root) == 0); + ibuf_mtr_commit(&mtr); heap = mem_heap_create(450); @@ -512,7 +609,7 @@ ibuf_init_at_db_start(void) dict_mem_table_add_col(table, heap, "DUMMY_COLUMN", DATA_BINARY, 0, 0); - table->id = ut_dulint_add(DICT_IBUF_ID_MIN, IBUF_SPACE_ID); + table->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID; dict_table_add_to_cache(table, heap); mem_heap_free(heap); @@ -523,7 +620,7 @@ ibuf_init_at_db_start(void) dict_mem_index_add_field(index, "DUMMY_COLUMN", 0); - index->id = ut_dulint_add(DICT_IBUF_ID_MIN, IBUF_SPACE_ID); + index->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID; error = dict_index_add_to_cache(table, index, FSP_IBUF_TREE_ROOT_PAGE_NO, FALSE); @@ -589,22 +686,49 @@ ibuf_parse_bitmap_init( return(ptr); } #ifndef UNIV_HOTBACKUP +# ifdef UNIV_DEBUG +/** Gets the desired bits for a given page from a bitmap page. +@param page in: bitmap page +@param offset in: page whose bits to get +@param zs in: compressed page size in bytes; 0 for uncompressed pages +@param bit in: IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... +@param mtr in: mini-transaction holding an x-latch on the bitmap page +@return value of bits */ +# define ibuf_bitmap_page_get_bits(page, offset, zs, bit, mtr) \ + ibuf_bitmap_page_get_bits_low(page, offset, zs, \ + MTR_MEMO_PAGE_X_FIX, mtr, bit) +# else /* UNIV_DEBUG */ +/** Gets the desired bits for a given page from a bitmap page. +@param page in: bitmap page +@param offset in: page whose bits to get +@param zs in: compressed page size in bytes; 0 for uncompressed pages +@param bit in: IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... +@param mtr in: mini-transaction holding an x-latch on the bitmap page +@return value of bits */ +# define ibuf_bitmap_page_get_bits(page, offset, zs, bit, mtr) \ + ibuf_bitmap_page_get_bits_low(page, offset, zs, bit) +# endif /* UNIV_DEBUG */ + /********************************************************************//** Gets the desired bits for a given page from a bitmap page. @return value of bits */ UNIV_INLINE ulint -ibuf_bitmap_page_get_bits( -/*======================*/ +ibuf_bitmap_page_get_bits_low( +/*==========================*/ const page_t* page, /*!< in: bitmap page */ ulint page_no,/*!< in: page whose bits to get */ ulint zip_size,/*!< in: compressed page size in bytes; 0 for uncompressed pages */ - ulint bit, /*!< in: IBUF_BITMAP_FREE, +#ifdef UNIV_DEBUG + ulint latch_type, + /*!< in: MTR_MEMO_PAGE_X_FIX, + MTR_MEMO_BUF_FIX, ... */ + mtr_t* mtr, /*!< in: mini-transaction holding latch_type + on the bitmap page */ +#endif /* UNIV_DEBUG */ + ulint bit) /*!< in: IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... */ - mtr_t* mtr __attribute__((unused))) - /*!< in: mtr containing an - x-latch to the bitmap page */ { ulint byte_offset; ulint bit_offset; @@ -616,7 +740,7 @@ ibuf_bitmap_page_get_bits( # error "IBUF_BITS_PER_PAGE % 2 != 0" #endif ut_ad(ut_is_2pow(zip_size)); - ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, page, latch_type)); if (!zip_size) { bit_offset = (page_no % UNIV_PAGE_SIZE) * IBUF_BITS_PER_PAGE @@ -1042,21 +1166,29 @@ Must not be called when recv_no_ibuf_operations==TRUE. @return TRUE if level 2 or level 3 page */ UNIV_INTERN ibool -ibuf_page( -/*======*/ - ulint space, /*!< in: space id */ - ulint zip_size,/*!< in: compressed page size in bytes, or 0 */ - ulint page_no,/*!< in: page number */ - mtr_t* mtr) /*!< in: mtr which will contain an x-latch to the - bitmap page if the page is not one of the fixed - address ibuf pages, or NULL, in which case a new - transaction is created. */ +ibuf_page_low( +/*==========*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes, or 0 */ + ulint page_no,/*!< in: page number */ +#ifdef UNIV_DEBUG + ibool x_latch,/*!< in: FALSE if relaxed check + (avoid latching the bitmap page) */ +#endif /* UNIV_DEBUG */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr which will contain an + x-latch to the bitmap page if the page + is not one of the fixed address ibuf + pages, or NULL, in which case a new + transaction is created. */ { ibool ret; mtr_t local_mtr; page_t* bitmap_page; ut_ad(!recv_no_ibuf_operations); + ut_ad(x_latch || mtr == NULL); if (ibuf_fixed_addr_page(space, zip_size, page_no)) { @@ -1068,12 +1200,44 @@ ibuf_page( ut_ad(fil_space_get_type(IBUF_SPACE_ID) == FIL_TABLESPACE); +#ifdef UNIV_DEBUG + if (!x_latch) { + mtr_start(&local_mtr); + + /* Get the bitmap page without a page latch, so that + we will not be violating the latching order when + another bitmap page has already been latched by this + thread. The page will be buffer-fixed, and thus it + cannot be removed or relocated while we are looking at + it. The contents of the page could change, but the + IBUF_BITMAP_IBUF bit that we are interested in should + not be modified by any other thread. Nobody should be + calling ibuf_add_free_page() or ibuf_remove_free_page() + while the page is linked to the insert buffer b-tree. */ + + bitmap_page = buf_block_get_frame( + buf_page_get_gen( + space, zip_size, + ibuf_bitmap_page_no_calc(zip_size, page_no), + RW_NO_LATCH, NULL, BUF_GET_NO_LATCH, + file, line, &local_mtr)); + + ret = ibuf_bitmap_page_get_bits_low( + bitmap_page, page_no, zip_size, + MTR_MEMO_BUF_FIX, &local_mtr, IBUF_BITMAP_IBUF); + + mtr_commit(&local_mtr); + return(ret); + } +#endif /* UNIV_DEBUG */ + if (mtr == NULL) { mtr = &local_mtr; mtr_start(mtr); } - bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr); + bitmap_page = ibuf_bitmap_get_map_page_func(space, page_no, zip_size, + file, line, mtr); ret = ibuf_bitmap_page_get_bits(bitmap_page, page_no, zip_size, IBUF_BITMAP_IBUF, mtr); @@ -1085,19 +1249,30 @@ ibuf_page( return(ret); } +#ifdef UNIV_DEBUG +# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(mtr,rec) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(rec) +#endif /* UNIV_DEBUG */ + /********************************************************************//** Returns the page number field of an ibuf record. @return page number */ static ulint -ibuf_rec_get_page_no( -/*=================*/ +ibuf_rec_get_page_no_func( +/*======================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ const rec_t* rec) /*!< in: ibuf record */ { const byte* field; ulint len; - ut_ad(ibuf_inside()); + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); ut_ad(rec_get_n_fields_old(rec) > 2); field = rec_get_nth_field_old(rec, 1, &len); @@ -1119,20 +1294,31 @@ ibuf_rec_get_page_no( return(mach_read_from_4(field)); } +#ifdef UNIV_DEBUG +# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(mtr,rec) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(rec) +#endif /* UNIV_DEBUG */ + /********************************************************************//** Returns the space id field of an ibuf record. For < 4.1.x format records returns 0. @return space id */ static ulint -ibuf_rec_get_space( -/*===============*/ +ibuf_rec_get_space_func( +/*====================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ const rec_t* rec) /*!< in: ibuf record */ { const byte* field; ulint len; - ut_ad(ibuf_inside()); + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); ut_ad(rec_get_n_fields_old(rec) > 2); field = rec_get_nth_field_old(rec, 1, &len); @@ -1153,9 +1339,215 @@ ibuf_rec_get_space( return(0); } +#ifdef UNIV_DEBUG +# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \ + ibuf_rec_get_info_func(mtr,rec,op,comp,info_len,counter) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \ + ibuf_rec_get_info_func(rec,op,comp,info_len,counter) +#endif +/****************************************************************//** +Get various information about an ibuf record in >= 4.1.x format. */ +static +void +ibuf_rec_get_info_func( +/*===================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec, /*!< in: ibuf record */ + ibuf_op_t* op, /*!< out: operation type, or NULL */ + ibool* comp, /*!< out: compact flag, or NULL */ + ulint* info_len, /*!< out: length of info fields at the + start of the fourth field, or + NULL */ + ulint* counter) /*!< in: counter value, or NULL */ +{ + const byte* types; + ulint fields; + ulint len; + + /* Local variables to shadow arguments. */ + ibuf_op_t op_local; + ibool comp_local; + ulint info_len_local; + ulint counter_local; + + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + fields = rec_get_n_fields_old(rec); + ut_a(fields > 4); + + types = rec_get_nth_field_old(rec, 3, &len); + + info_len_local = len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE; + + switch (info_len_local) { + case 0: + case 1: + op_local = IBUF_OP_INSERT; + comp_local = info_len_local; + ut_ad(!counter); + counter_local = ULINT_UNDEFINED; + break; + + case IBUF_REC_INFO_SIZE: + op_local = (ibuf_op_t)types[IBUF_REC_OFFSET_TYPE]; + comp_local = types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT; + counter_local = mach_read_from_2( + types + IBUF_REC_OFFSET_COUNTER); + break; + + default: + ut_error; + } + + ut_a(op_local < IBUF_OP_COUNT); + ut_a((len - info_len_local) == + (fields - 4) * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + if (op) { + *op = op_local; + } + + if (comp) { + *comp = comp_local; + } + + if (info_len) { + *info_len = info_len_local; + } + + if (counter) { + *counter = counter_local; + } +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(mtr,rec) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(rec) +#endif + +/****************************************************************//** +Returns the operation type field of an ibuf record. +@return operation type */ +static +ibuf_op_t +ibuf_rec_get_op_type_func( +/*======================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec) /*!< in: ibuf record */ +{ + ulint len; + + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + ut_ad(rec_get_n_fields_old(rec) > 2); + + (void) rec_get_nth_field_old(rec, 1, &len); + + if (len > 1) { + /* This is a < 4.1.x format record */ + + return(IBUF_OP_INSERT); + } else { + ibuf_op_t op; + + ibuf_rec_get_info(mtr, rec, &op, NULL, NULL, NULL); + + return(op); + } +} + +/****************************************************************//** +Read the first two bytes from a record's fourth field (counter field in new +records; something else in older records). +@return "counter" field, or ULINT_UNDEFINED if for some reason it +can't be read */ +UNIV_INTERN +ulint +ibuf_rec_get_counter( +/*=================*/ + const rec_t* rec) /*!< in: ibuf record */ +{ + const byte* ptr; + ulint len; + + if (rec_get_n_fields_old(rec) < 4) { + + return(ULINT_UNDEFINED); + } + + ptr = rec_get_nth_field_old(rec, 3, &len); + + if (len >= 2) { + + return(mach_read_from_2(ptr)); + } else { + + return(ULINT_UNDEFINED); + } +} + +/****************************************************************//** +Add accumulated operation counts to a permanent array. Both arrays must be +of size IBUF_OP_COUNT. */ +static +void +ibuf_add_ops( +/*=========*/ + ulint* arr, /*!< in/out: array to modify */ + const ulint* ops) /*!< in: operation counts */ + +{ + ulint i; + +#ifndef HAVE_ATOMIC_BUILTINS + ut_ad(mutex_own(&ibuf_mutex)); +#endif /* !HAVE_ATOMIC_BUILTINS */ + + for (i = 0; i < IBUF_OP_COUNT; i++) { +#ifdef HAVE_ATOMIC_BUILTINS + os_atomic_increment_ulint(&arr[i], ops[i]); +#else /* HAVE_ATOMIC_BUILTINS */ + arr[i] += ops[i]; +#endif /* HAVE_ATOMIC_BUILTINS */ + } +} + +/****************************************************************//** +Print operation counts. The array must be of size IBUF_OP_COUNT. */ +static +void +ibuf_print_ops( +/*===========*/ + const ulint* ops, /*!< in: operation counts */ + FILE* file) /*!< in: file where to print */ +{ + static const char* op_names[] = { + "insert", + "delete mark", + "delete" + }; + ulint i; + + ut_a(UT_ARR_SIZE(op_names) == IBUF_OP_COUNT); + + for (i = 0; i < IBUF_OP_COUNT; i++) { + fprintf(file, "%s %lu%s", op_names[i], + (ulong) ops[i], (i < (IBUF_OP_COUNT - 1)) ? ", " : ""); + } + + putc('\n', file); +} + /********************************************************************//** Creates a dummy index for inserting a record to a non-clustered index. - @return dummy index */ static dict_index_t* @@ -1265,9 +1657,25 @@ ibuf_build_entry_pre_4_1_x( return(tuple); } +#ifdef UNIV_DEBUG +# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex) \ + ibuf_build_entry_from_ibuf_rec_func(mtr,ibuf_rec,heap,pindex) +#else /* UNIV_DEBUG */ +# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex) \ + ibuf_build_entry_from_ibuf_rec_func(ibuf_rec,heap,pindex) +#endif + /*********************************************************************//** -Builds the entry to insert into a non-clustered index when we have the -corresponding record in an ibuf index. +Builds the entry used to + +1) IBUF_OP_INSERT: insert into a non-clustered index + +2) IBUF_OP_DELETE_MARK: find the record whose delete-mark flag we need to + activate + +3) IBUF_OP_DELETE: find the record we need to delete + +when we have the corresponding record in an ibuf index. NOTE that as we copy pointers to fields in ibuf_rec, the caller must hold a latch to the ibuf_rec page as long as the entry is used! @@ -1275,8 +1683,11 @@ hold a latch to the ibuf_rec page as long as the entry is used! @return own: entry to insert to a non-clustered index */ static dtuple_t* -ibuf_build_entry_from_ibuf_rec( -/*===========================*/ +ibuf_build_entry_from_ibuf_rec_func( +/*================================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ const rec_t* ibuf_rec, /*!< in: record in an insert buffer */ mem_heap_t* heap, /*!< in: heap where built */ dict_index_t** pindex) /*!< out, own: dummy index that @@ -1288,9 +1699,15 @@ ibuf_build_entry_from_ibuf_rec( const byte* types; const byte* data; ulint len; + ulint info_len; ulint i; + ulint comp; dict_index_t* index; + ut_ad(mtr_memo_contains_page(mtr, ibuf_rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, ibuf_rec, MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + data = rec_get_nth_field_old(ibuf_rec, 1, &len); if (len > 1) { @@ -1311,16 +1728,12 @@ ibuf_build_entry_from_ibuf_rec( types = rec_get_nth_field_old(ibuf_rec, 3, &len); - ut_a(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE <= 1); - index = ibuf_dummy_index_create( - n_fields, len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + ibuf_rec_get_info(mtr, ibuf_rec, NULL, &comp, &info_len, NULL); - if (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) { - /* compact record format */ - len--; - ut_a(*types == 0); - types++; - } + index = ibuf_dummy_index_create(n_fields, comp); + + len -= info_len; + types += info_len; ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); @@ -1350,6 +1763,64 @@ ibuf_build_entry_from_ibuf_rec( return(tuple); } +/******************************************************************//** +Get the data size. +@return size of fields */ +UNIV_INLINE +ulint +ibuf_rec_get_size( +/*==============*/ + const rec_t* rec, /*!< in: ibuf record */ + const byte* types, /*!< in: fields */ + ulint n_fields, /*!< in: number of fields */ + ibool pre_4_1, /*!< in: TRUE=pre-4.1 format, + FALSE=newer */ + ulint comp) /*!< in: 0=ROW_FORMAT=REDUNDANT, + nonzero=ROW_FORMAT=COMPACT */ +{ + ulint i; + ulint field_offset; + ulint types_offset; + ulint size = 0; + + if (pre_4_1) { + field_offset = 2; + types_offset = DATA_ORDER_NULL_TYPE_BUF_SIZE; + } else { + field_offset = 4; + types_offset = DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE; + } + + for (i = 0; i < n_fields; i++) { + ulint len; + dtype_t dtype; + + rec_get_nth_field_offs_old(rec, i + field_offset, &len); + + if (len != UNIV_SQL_NULL) { + size += len; + } else if (pre_4_1) { + dtype_read_for_order_and_null_size(&dtype, types); + + size += dtype_get_sql_null_size(&dtype, comp); + } else { + dtype_new_read_for_order_and_null_size(&dtype, types); + + size += dtype_get_sql_null_size(&dtype, comp); + } + + types += types_offset; + } + + return(size); +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(mtr,rec) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(rec) +#endif + /********************************************************************//** Returns the space taken by a stored non-clustered index entry if converted to an index record. @@ -1357,26 +1828,30 @@ an index record. taken in the page directory */ static ulint -ibuf_rec_get_volume( -/*================*/ +ibuf_rec_get_volume_func( +/*=====================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ const rec_t* ibuf_rec)/*!< in: ibuf record */ { - dtype_t dtype; - ibool new_format = FALSE; - ulint data_size = 0; - ulint n_fields; - const byte* types; - const byte* data; ulint len; - ulint i; + const byte* data; + const byte* types; + ulint n_fields; + ulint data_size; + ibool pre_4_1; ulint comp; - ut_ad(ibuf_inside()); + ut_ad(mtr_memo_contains_page(mtr, ibuf_rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, ibuf_rec, MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); ut_ad(rec_get_n_fields_old(ibuf_rec) > 2); data = rec_get_nth_field_old(ibuf_rec, 1, &len); + pre_4_1 = (len > 1); - if (len > 1) { + if (pre_4_1) { /* < 4.1.x format record */ ut_a(trx_doublewrite_must_reset_space_ids); @@ -1390,54 +1865,46 @@ ibuf_rec_get_volume( comp = 0; } else { /* >= 4.1.x format record */ + ibuf_op_t op; + ulint info_len; ut_a(trx_sys_multiple_tablespace_format); ut_a(*data == 0); types = rec_get_nth_field_old(ibuf_rec, 3, &len); - comp = len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE; + ibuf_rec_get_info(mtr, ibuf_rec, &op, &comp, &info_len, NULL); + + if (op == IBUF_OP_DELETE_MARK || op == IBUF_OP_DELETE) { + /* Delete-marking a record doesn't take any + additional space, and while deleting a record + actually frees up space, we have to play it safe and + pretend it takes no additional space (the record + might not exist, etc.). */ - ut_a(comp <= 1); - if (comp) { - /* compact record format */ + return(0); + } else if (comp) { + dtuple_t* entry; ulint volume; dict_index_t* dummy_index; mem_heap_t* heap = mem_heap_create(500); - dtuple_t* entry = ibuf_build_entry_from_ibuf_rec( - ibuf_rec, heap, &dummy_index); + + entry = ibuf_build_entry_from_ibuf_rec( + mtr, ibuf_rec, heap, &dummy_index); + volume = rec_get_converted_size(dummy_index, entry, 0); + ibuf_dummy_index_free(dummy_index); mem_heap_free(heap); + return(volume + page_dir_calc_reserved_space(1)); } + types += info_len; n_fields = rec_get_n_fields_old(ibuf_rec) - 4; - - new_format = TRUE; } - for (i = 0; i < n_fields; i++) { - if (new_format) { - data = rec_get_nth_field_old(ibuf_rec, i + 4, &len); - - dtype_new_read_for_order_and_null_size( - &dtype, types + i - * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); - } else { - data = rec_get_nth_field_old(ibuf_rec, i + 2, &len); - - dtype_read_for_order_and_null_size( - &dtype, types + i - * DATA_ORDER_NULL_TYPE_BUF_SIZE); - } - - if (len == UNIV_SQL_NULL) { - data_size += dtype_get_sql_null_size(&dtype, comp); - } else { - data_size += len; - } - } + data_size = ibuf_rec_get_size(ibuf_rec, types, n_fields, pre_4_1, comp); return(data_size + rec_get_converted_extra_size(data_size, n_fields, 0) + page_dir_calc_reserved_space(1)); @@ -1455,11 +1922,14 @@ static dtuple_t* ibuf_entry_build( /*=============*/ + ibuf_op_t op, /*!< in: operation type */ dict_index_t* index, /*!< in: non-clustered index */ const dtuple_t* entry, /*!< in: entry for a non-clustered index */ ulint space, /*!< in: space id */ ulint page_no,/*!< in: index page number where entry should be inserted */ + ulint counter,/*!< in: counter value; + ULINT_UNDEFINED=not used */ mem_heap_t* heap) /*!< in: heap into which to build */ { dtuple_t* tuple; @@ -1467,28 +1937,28 @@ ibuf_entry_build( const dfield_t* entry_field; ulint n_fields; byte* buf; - byte* buf2; + byte* ti; + byte* type_info; ulint i; - /* Starting from 4.1.x, we have to build a tuple whose - (1) first field is the space id, - (2) the second field a single marker byte (0) to tell that this - is a new format record, - (3) the third contains the page number, and - (4) the fourth contains the relevent type information of each data - field; the length of this field % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE is - (a) 0 for b-trees in the old format, and - (b) 1 for b-trees in the compact format, the first byte of the field - being the marker (0); - (5) and the rest of the fields are copied from entry. All fields - in the tuple are ordered like the type binary in our insert buffer - tree. */ + ut_ad(counter != ULINT_UNDEFINED || op == IBUF_OP_INSERT); + ut_ad(counter == ULINT_UNDEFINED || counter <= 0xFFFF); + ut_ad(op < IBUF_OP_COUNT); + + /* We have to build a tuple with the following fields: + + 1-4) These are described at the top of this file. + + 5) The rest of the fields are copied from the entry. + + All fields in the tuple are ordered like the type binary in our + insert buffer tree. */ n_fields = dtuple_get_n_fields(entry); tuple = dtuple_create(heap, n_fields + 4); - /* Store the space id in tuple */ + /* 1) Space Id */ field = dtuple_get_nth_field(tuple, 0); @@ -1498,7 +1968,7 @@ ibuf_entry_build( dfield_set_data(field, buf, 4); - /* Store the marker byte field in tuple */ + /* 2) Marker byte */ field = dtuple_get_nth_field(tuple, 1); @@ -1510,7 +1980,7 @@ ibuf_entry_build( dfield_set_data(field, buf, 1); - /* Store the page number in tuple */ + /* 3) Page number */ field = dtuple_get_nth_field(tuple, 2); @@ -1520,14 +1990,42 @@ ibuf_entry_build( dfield_set_data(field, buf, 4); - /* Store the type info in buf2, and add the fields from entry to - tuple */ - buf2 = mem_heap_alloc(heap, n_fields - * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE - + dict_table_is_comp(index->table)); - if (dict_table_is_comp(index->table)) { - *buf2++ = 0; /* write the compact format indicator */ + /* 4) Type info, part #1 */ + + if (counter == ULINT_UNDEFINED) { + i = dict_table_is_comp(index->table) ? 1 : 0; + } else { + ut_ad(counter <= 0xFFFF); + i = IBUF_REC_INFO_SIZE; + } + + ti = type_info = mem_heap_alloc(heap, i + n_fields + * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + switch (i) { + default: + ut_error; + break; + case 1: + /* set the flag for ROW_FORMAT=COMPACT */ + *ti++ = 0; + /* fall through */ + case 0: + /* the old format does not allow delete buffering */ + ut_ad(op == IBUF_OP_INSERT); + break; + case IBUF_REC_INFO_SIZE: + mach_write_to_2(ti + IBUF_REC_OFFSET_COUNTER, counter); + + ti[IBUF_REC_OFFSET_TYPE] = (byte) op; + ti[IBUF_REC_OFFSET_FLAGS] = dict_table_is_comp(index->table) + ? IBUF_REC_COMPACT : 0; + ti += IBUF_REC_INFO_SIZE; + break; } + + /* 5+) Fields from the entry */ + for (i = 0; i < n_fields; i++) { ulint fixed_len; const dict_field_t* ifield; @@ -1562,21 +2060,16 @@ ibuf_entry_build( #endif /* UNIV_DEBUG */ dtype_new_store_for_order_and_null_size( - buf2 + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE, - dfield_get_type(entry_field), fixed_len); + ti, dfield_get_type(entry_field), fixed_len); + ti += DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE; } - /* Store the type info in buf2 to field 3 of tuple */ + /* 4) Type info, part #2 */ field = dtuple_get_nth_field(tuple, 3); - if (dict_table_is_comp(index->table)) { - buf2--; - } + dfield_set_data(field, type_info, ti - type_info); - dfield_set_data(field, buf2, n_fields - * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE - + dict_table_is_comp(index->table)); /* Set all the types in the new tuple binary */ dtuple_set_types_binary(tuple, n_fields + 4); @@ -1713,9 +2206,9 @@ ibuf_data_too_much_free(void) /*********************************************************************//** Allocates a new page from the ibuf file segment and adds it to the free list. -@return DB_SUCCESS, or DB_STRONG_FAIL if no space left */ +@return TRUE on success, FALSE if no space left */ static -ulint +ibool ibuf_add_free_page(void) /*====================*/ { @@ -1751,25 +2244,19 @@ ibuf_add_free_page(void) header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, 0, FSP_UP, &mtr); - if (page_no == FIL_NULL) { + if (UNIV_UNLIKELY(page_no == FIL_NULL)) { mtr_commit(&mtr); - return(DB_STRONG_FAIL); - } - - { - buf_block_t* block; - - block = buf_page_get( + return(FALSE); + } else { + buf_block_t* block = buf_page_get( IBUF_SPACE_ID, 0, page_no, RW_X_LATCH, &mtr); - buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW); - page = buf_block_get_frame(block); } - ibuf_enter(); + ibuf_enter(&mtr); mutex_enter(&ibuf_mutex); @@ -1792,16 +2279,14 @@ ibuf_add_free_page(void) bitmap_page = ibuf_bitmap_get_map_page( IBUF_SPACE_ID, page_no, zip_size, &mtr); + mutex_exit(&ibuf_mutex); + ibuf_bitmap_page_set_bits( bitmap_page, page_no, zip_size, IBUF_BITMAP_IBUF, TRUE, &mtr); - mtr_commit(&mtr); - - mutex_exit(&ibuf_mutex); - - ibuf_exit(); + ibuf_mtr_commit(&mtr); - return(DB_SUCCESS); + return(TRUE); } /*********************************************************************//** @@ -1831,29 +2316,26 @@ ibuf_remove_free_page(void) header_page = ibuf_header_page_get(&mtr); /* Prevent pessimistic inserts to insert buffer trees for a while */ + ibuf_enter(&mtr); mutex_enter(&ibuf_pessimistic_insert_mutex); - - ibuf_enter(); - mutex_enter(&ibuf_mutex); if (!ibuf_data_too_much_free()) { mutex_exit(&ibuf_mutex); - - ibuf_exit(); - mutex_exit(&ibuf_pessimistic_insert_mutex); - mtr_commit(&mtr); + ibuf_mtr_commit(&mtr); return; } - mtr_start(&mtr2); + ibuf_mtr_start(&mtr2); root = ibuf_tree_root_get(&mtr2); + mutex_exit(&ibuf_mutex); + page_no = flst_get_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, &mtr2).page; @@ -1861,10 +2343,8 @@ ibuf_remove_free_page(void) because in fseg_free_page we access level 1 pages, and the root is a level 2 page. */ - mtr_commit(&mtr2); - mutex_exit(&ibuf_mutex); - - ibuf_exit(); + ibuf_mtr_commit(&mtr2); + ibuf_exit(&mtr); /* Since pessimistic inserts were prevented, we know that the page is still in the free list. NOTE that also deletes may take @@ -1875,11 +2355,11 @@ ibuf_remove_free_page(void) fseg_free_page(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, IBUF_SPACE_ID, page_no, &mtr); -#ifdef UNIV_DEBUG_FILE_ACCESSES +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG buf_page_reset_file_page_was_freed(IBUF_SPACE_ID, page_no); -#endif +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ - ibuf_enter(); + ibuf_enter(&mtr); mutex_enter(&ibuf_mutex); @@ -1905,28 +2385,26 @@ ibuf_remove_free_page(void) flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr); + mutex_exit(&ibuf_pessimistic_insert_mutex); + ibuf->seg_size--; ibuf->free_list_len--; - mutex_exit(&ibuf_pessimistic_insert_mutex); - /* Set the bit indicating that this page is no more an ibuf tree page (level 2 page) */ bitmap_page = ibuf_bitmap_get_map_page( IBUF_SPACE_ID, page_no, zip_size, &mtr); + mutex_exit(&ibuf_mutex); + ibuf_bitmap_page_set_bits( bitmap_page, page_no, zip_size, IBUF_BITMAP_IBUF, FALSE, &mtr); -#ifdef UNIV_DEBUG_FILE_ACCESSES +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG buf_page_set_file_page_was_freed(IBUF_SPACE_ID, page_no); -#endif - mtr_commit(&mtr); - - mutex_exit(&ibuf_mutex); - - ibuf_exit(); +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ + ibuf_mtr_commit(&mtr); } /***********************************************************************//** @@ -1948,8 +2426,6 @@ ibuf_free_excess_pages(void) ut_ad(rw_lock_get_x_lock_count( fil_space_get_latch(IBUF_SPACE_ID, NULL)) == 1); - ut_ad(!ibuf_inside()); - /* NOTE: We require that the thread did not own the latch before, because then we know that we can obey the correct latching order for ibuf latches */ @@ -1966,35 +2442,44 @@ ibuf_free_excess_pages(void) for (i = 0; i < 4; i++) { - mutex_enter(&ibuf_mutex); - - if (!ibuf_data_too_much_free()) { + ibool too_much_free; - mutex_exit(&ibuf_mutex); + mutex_enter(&ibuf_mutex); + too_much_free = ibuf_data_too_much_free(); + mutex_exit(&ibuf_mutex); + if (!too_much_free) { return; } - mutex_exit(&ibuf_mutex); - ibuf_remove_free_page(); } } +#ifdef UNIV_DEBUG +# define ibuf_get_merge_page_nos(contract,rec,mtr,ids,vers,pages,n_stored) \ + ibuf_get_merge_page_nos_func(contract,rec,mtr,ids,vers,pages,n_stored) +#else /* UNIV_DEBUG */ +# define ibuf_get_merge_page_nos(contract,rec,mtr,ids,vers,pages,n_stored) \ + ibuf_get_merge_page_nos_func(contract,rec,ids,vers,pages,n_stored) +#endif /* UNIV_DEBUG */ + /*********************************************************************//** Reads page numbers from a leaf in an ibuf tree. @return a lower limit for the combined volume of records which will be merged */ static ulint -ibuf_get_merge_page_nos( -/*====================*/ +ibuf_get_merge_page_nos_func( +/*=========================*/ ibool contract,/*!< in: TRUE if this function is called to contract the tree, FALSE if this is called when a single page becomes full and we look if it pays to read also nearby pages */ - rec_t* rec, /*!< in: record from which we read up and down - in the chain of records */ + const rec_t* rec, /*!< in: insert buffer record */ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction holding rec */ +#endif /* UNIV_DEBUG */ ulint* space_ids,/*!< in/out: space id's of the pages */ ib_int64_t* space_versions,/*!< in/out: tablespace version timestamps; used to prevent reading in old @@ -2017,18 +2502,22 @@ ibuf_get_merge_page_nos( ulint limit; ulint n_pages; + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + *n_stored = 0; - limit = ut_min(IBUF_MAX_N_PAGES_MERGED, buf_pool->curr_size / 4); + limit = ut_min(IBUF_MAX_N_PAGES_MERGED, buf_pool_get_curr_size() / 4); if (page_rec_is_supremum(rec)) { - rec = page_rec_get_prev(rec); + rec = page_rec_get_prev_const(rec); } if (page_rec_is_infimum(rec)) { - rec = page_rec_get_next(rec); + rec = page_rec_get_next_const(rec); } if (page_rec_is_supremum(rec)) { @@ -2036,8 +2525,8 @@ ibuf_get_merge_page_nos( return(0); } - first_page_no = ibuf_rec_get_page_no(rec); - first_space_id = ibuf_rec_get_space(rec); + first_page_no = ibuf_rec_get_page_no(mtr, rec); + first_space_id = ibuf_rec_get_space(mtr, rec); n_pages = 0; prev_page_no = 0; prev_space_id = 0; @@ -2048,8 +2537,8 @@ ibuf_get_merge_page_nos( while (!page_rec_is_infimum(rec) && UNIV_LIKELY(n_pages < limit)) { - rec_page_no = ibuf_rec_get_page_no(rec); - rec_space_id = ibuf_rec_get_space(rec); + rec_page_no = ibuf_rec_get_page_no(mtr, rec); + rec_space_id = ibuf_rec_get_space(mtr, rec); if (rec_space_id != first_space_id || (rec_page_no / IBUF_MERGE_AREA) @@ -2066,10 +2555,10 @@ ibuf_get_merge_page_nos( prev_page_no = rec_page_no; prev_space_id = rec_space_id; - rec = page_rec_get_prev(rec); + rec = page_rec_get_prev_const(rec); } - rec = page_rec_get_next(rec); + rec = page_rec_get_next_const(rec); /* At the loop start there is no prev page; we mark this with a pair of space id, page no (0, 0) for which there can never be entries in @@ -2087,8 +2576,8 @@ ibuf_get_merge_page_nos( rec_page_no = 1; rec_space_id = 0; } else { - rec_page_no = ibuf_rec_get_page_no(rec); - rec_space_id = ibuf_rec_get_space(rec); + rec_page_no = ibuf_rec_get_page_no(mtr, rec); + rec_space_id = ibuf_rec_get_space(mtr, rec); ut_ad(rec_page_no > IBUF_TREE_ROOT_PAGE_NO); } @@ -2099,9 +2588,9 @@ ibuf_get_merge_page_nos( || rec_page_no != prev_page_no) && (prev_space_id != 0 || prev_page_no != 0)) { - if ((prev_page_no == first_page_no - && prev_space_id == first_space_id) - || contract + if (contract + || (prev_page_no == first_page_no + && prev_space_id == first_space_id) || (volume_for_page > ((IBUF_MERGE_THRESHOLD - 1) * 4 * UNIV_PAGE_SIZE @@ -2134,14 +2623,14 @@ ibuf_get_merge_page_nos( break; } - rec_volume = ibuf_rec_get_volume(rec); + rec_volume = ibuf_rec_get_volume(mtr, rec); volume_for_page += rec_volume; prev_page_no = rec_page_no; prev_space_id = rec_space_id; - rec = page_rec_get_next(rec); + rec = page_rec_get_next_const(rec); } #ifdef UNIV_IBUF_DEBUG @@ -2172,65 +2661,59 @@ ibuf_contract_ext( ulint page_nos[IBUF_MAX_N_PAGES_MERGED]; ulint space_ids[IBUF_MAX_N_PAGES_MERGED]; ib_int64_t space_versions[IBUF_MAX_N_PAGES_MERGED]; - ulint n_stored; ulint sum_sizes; mtr_t mtr; *n_pages = 0; - ut_ad(!ibuf_inside()); - - mutex_enter(&ibuf_mutex); - if (ibuf->empty) { -ibuf_is_empty: - mutex_exit(&ibuf_mutex); + /* We perform a dirty read of ibuf->empty, without latching + the insert buffer root page. We trust this dirty read except + when a slow shutdown is being executed. During a slow + shutdown, the insert buffer merge must be completed. */ + if (UNIV_UNLIKELY(ibuf->empty) + && UNIV_LIKELY(!srv_shutdown_state)) { return(0); } - mtr_start(&mtr); - - ibuf_enter(); + ibuf_mtr_start(&mtr); /* Open a cursor to a randomly chosen leaf of the tree, at a random position within the leaf */ btr_pcur_open_at_rnd_pos(ibuf->index, BTR_SEARCH_LEAF, &pcur, &mtr); - if (page_get_n_recs(btr_pcur_get_page(&pcur)) == 0) { - /* When the ibuf tree is emptied completely, the last record - is removed using an optimistic delete and ibuf_size_update - is not called, causing ibuf->empty to remain FALSE. If we do - not reset it to TRUE here then database shutdown will hang - in the loop in ibuf_contract_for_n_pages. */ + ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index)); - ibuf->empty = TRUE; - - ibuf_exit(); - - mtr_commit(&mtr); + if (page_get_n_recs(btr_pcur_get_page(&pcur)) == 0) { + /* If a B-tree page is empty, it must be the root page + and the whole B-tree must be empty. InnoDB does not + allow empty B-tree pages other than the root. */ + ut_ad(ibuf->empty); + ut_ad(page_get_space_id(btr_pcur_get_page(&pcur)) + == IBUF_SPACE_ID); + ut_ad(page_get_page_no(btr_pcur_get_page(&pcur)) + == FSP_IBUF_TREE_ROOT_PAGE_NO); + + ibuf_mtr_commit(&mtr); btr_pcur_close(&pcur); - goto ibuf_is_empty; + return(0); } - mutex_exit(&ibuf_mutex); - - sum_sizes = ibuf_get_merge_page_nos(TRUE, btr_pcur_get_rec(&pcur), + sum_sizes = ibuf_get_merge_page_nos(TRUE, + btr_pcur_get_rec(&pcur), &mtr, space_ids, space_versions, - page_nos, &n_stored); + page_nos, n_pages); #if 0 /* defined UNIV_IBUF_DEBUG */ fprintf(stderr, "Ibuf contract sync %lu pages %lu volume %lu\n", - sync, n_stored, sum_sizes); + sync, *n_pages, sum_sizes); #endif - ibuf_exit(); - - mtr_commit(&mtr); + ibuf_mtr_commit(&mtr); btr_pcur_close(&pcur); buf_read_ibuf_merge_pages(sync, space_ids, space_versions, page_nos, - n_stored); - *n_pages = n_stored; + *n_pages); return(sum_sizes + 1); } @@ -2300,34 +2783,201 @@ ibuf_contract_after_insert( ibool sync; ulint sum_sizes; ulint size; - - mutex_enter(&ibuf_mutex); + ulint max_size; + + /* Perform dirty reads of ibuf->size and ibuf->max_size, to + reduce ibuf_mutex contention. ibuf->max_size remains constant + after ibuf_init_at_db_start(), but ibuf->size should be + protected by ibuf_mutex. Given that ibuf->size fits in a + machine word, this should be OK; at worst we are doing some + excessive ibuf_contract() or occasionally skipping a + ibuf_contract(). */ + size = ibuf->size; + max_size = ibuf->max_size; if (!srv_ibuf_active_contract) { - if (ibuf->size < ibuf->max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) { - mutex_exit(&ibuf_mutex); - + if (size < max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) { return; } } - sync = FALSE; - - if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_ON_INSERT_SYNC) { - - sync = TRUE; - } - - mutex_exit(&ibuf_mutex); + sync = (size >= max_size + IBUF_CONTRACT_ON_INSERT_SYNC); /* Contract at least entry_size many bytes */ sum_sizes = 0; size = 1; - while ((size > 0) && (sum_sizes < entry_size)) { + do { size = ibuf_contract(sync); sum_sizes += size; + } while (size > 0 && sum_sizes < entry_size); +} + +/*********************************************************************//** +Determine if an insert buffer record has been encountered already. +@return TRUE if a new record, FALSE if possible duplicate */ +static +ibool +ibuf_get_volume_buffered_hash( +/*==========================*/ + const rec_t* rec, /*!< in: ibuf record in post-4.1 format */ + const byte* types, /*!< in: fields */ + const byte* data, /*!< in: start of user record data */ + ulint comp, /*!< in: 0=ROW_FORMAT=REDUNDANT, + nonzero=ROW_FORMAT=COMPACT */ + ulint* hash, /*!< in/out: hash array */ + ulint size) /*!< in: number of elements in hash array */ +{ + ulint len; + ulint fold; + ulint bitmask; + + len = ibuf_rec_get_size(rec, types, rec_get_n_fields_old(rec) - 4, + FALSE, comp); + fold = ut_fold_binary(data, len); + + hash += (fold / (CHAR_BIT * sizeof *hash)) % size; + bitmask = 1 << (fold % (CHAR_BIT * sizeof *hash)); + + if (*hash & bitmask) { + + return(FALSE); + } + + /* We have not seen this record yet. Insert it. */ + *hash |= bitmask; + + return(TRUE); +} + +#ifdef UNIV_DEBUG +# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \ + ibuf_get_volume_buffered_count_func(mtr,rec,hash,size,n_recs) +#else /* UNIV_DEBUG */ +# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \ + ibuf_get_volume_buffered_count_func(rec,hash,size,n_recs) +#endif +/*********************************************************************//** +Update the estimate of the number of records on a page, and +get the space taken by merging the buffered record to the index page. +@return size of index record in bytes + an upper limit of the space +taken in the page directory */ +static +ulint +ibuf_get_volume_buffered_count_func( +/*================================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec, /*!< in: insert buffer record */ + ulint* hash, /*!< in/out: hash array */ + ulint size, /*!< in: number of elements in hash array */ + lint* n_recs) /*!< in/out: estimated number of records + on the page that rec points to */ +{ + ulint len; + ibuf_op_t ibuf_op; + const byte* types; + ulint n_fields; + + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + + n_fields = rec_get_n_fields_old(rec); + ut_ad(n_fields > 4); + n_fields -= 4; + + rec_get_nth_field_offs_old(rec, 1, &len); + /* This function is only invoked when buffering new + operations. All pre-4.1 records should have been merged + when the database was started up. */ + ut_a(len == 1); + ut_ad(trx_sys_multiple_tablespace_format); + + types = rec_get_nth_field_old(rec, 3, &len); + + switch (UNIV_EXPECT(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE, + IBUF_REC_INFO_SIZE)) { + default: + ut_error; + case 0: + /* This ROW_TYPE=REDUNDANT record does not include an + operation counter. Exclude it from the *n_recs, + because deletes cannot be buffered if there are + old-style inserts buffered for the page. */ + + len = ibuf_rec_get_size(rec, types, n_fields, FALSE, 0); + + return(len + + rec_get_converted_extra_size(len, n_fields, 0) + + page_dir_calc_reserved_space(1)); + case 1: + /* This ROW_TYPE=COMPACT record does not include an + operation counter. Exclude it from the *n_recs, + because deletes cannot be buffered if there are + old-style inserts buffered for the page. */ + goto get_volume_comp; + + case IBUF_REC_INFO_SIZE: + ibuf_op = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE]; + break; + } + + switch (ibuf_op) { + case IBUF_OP_INSERT: + /* Inserts can be done by updating a delete-marked record. + Because delete-mark and insert operations can be pointing to + the same records, we must not count duplicates. */ + case IBUF_OP_DELETE_MARK: + /* There must be a record to delete-mark. + See if this record has been already buffered. */ + if (n_recs && ibuf_get_volume_buffered_hash( + rec, types + IBUF_REC_INFO_SIZE, + types + len, + types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT, + hash, size)) { + (*n_recs)++; + } + + if (ibuf_op == IBUF_OP_DELETE_MARK) { + /* Setting the delete-mark flag does not + affect the available space on the page. */ + return(0); + } + break; + case IBUF_OP_DELETE: + /* A record will be removed from the page. */ + if (n_recs) { + (*n_recs)--; + } + /* While deleting a record actually frees up space, + we have to play it safe and pretend that it takes no + additional space (the record might not exist, etc.). */ + return(0); + default: + ut_error; + } + + ut_ad(ibuf_op == IBUF_OP_INSERT); + +get_volume_comp: + { + dtuple_t* entry; + ulint volume; + dict_index_t* dummy_index; + mem_heap_t* heap = mem_heap_create(500); + + entry = ibuf_build_entry_from_ibuf_rec( + mtr, rec, heap, &dummy_index); + + volume = rec_get_converted_size(dummy_index, entry, 0); + + ibuf_dummy_index_free(dummy_index); + mem_heap_free(heap); + + return(volume + page_dir_calc_reserved_space(1)); } } @@ -2341,55 +2991,63 @@ static ulint ibuf_get_volume_buffered( /*=====================*/ - btr_pcur_t* pcur, /*!< in: pcur positioned at a place in an + const btr_pcur_t*pcur, /*!< in: pcur positioned at a place in an insert buffer tree where we would insert an entry for the index page whose number is page_no, latch mode has to be BTR_MODIFY_PREV or BTR_MODIFY_TREE */ ulint space, /*!< in: space id */ ulint page_no,/*!< in: page number of an index page */ - mtr_t* mtr) /*!< in: mtr */ + lint* n_recs, /*!< in/out: minimum number of records on the + page after the buffered changes have been + applied, or NULL to disable the counting */ + mtr_t* mtr) /*!< in: mini-transaction of pcur */ { - ulint volume; - rec_t* rec; - page_t* page; - ulint prev_page_no; - page_t* prev_page; - ulint next_page_no; - page_t* next_page; + ulint volume; + const rec_t* rec; + const page_t* page; + ulint prev_page_no; + const page_t* prev_page; + ulint next_page_no; + const page_t* next_page; + /* bitmap of buffered recs */ + ulint hash_bitmap[128 / sizeof(ulint)]; ut_a(trx_sys_multiple_tablespace_format); ut_ad((pcur->latch_mode == BTR_MODIFY_PREV) || (pcur->latch_mode == BTR_MODIFY_TREE)); - /* Count the volume of records earlier in the alphabetical order than + /* Count the volume of inserts earlier in the alphabetical order than pcur */ volume = 0; + if (n_recs) { + memset(hash_bitmap, 0, sizeof hash_bitmap); + } + rec = btr_pcur_get_rec(pcur); page = page_align(rec); + ut_ad(page_validate(page, ibuf->index)); if (page_rec_is_supremum(rec)) { - rec = page_rec_get_prev(rec); + rec = page_rec_get_prev_const(rec); } - for (;;) { - if (page_rec_is_infimum(rec)) { + for (; !page_rec_is_infimum(rec); + rec = page_rec_get_prev_const(rec)) { + ut_ad(page_align(rec) == page); - break; - } - - if (page_no != ibuf_rec_get_page_no(rec) - || space != ibuf_rec_get_space(rec)) { + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { goto count_later; } - volume += ibuf_rec_get_volume(rec); - - rec = page_rec_get_prev(rec); + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); } /* Look at the previous page */ @@ -2405,23 +3063,26 @@ ibuf_get_volume_buffered( buf_block_t* block; block = buf_page_get( - IBUF_SPACE_ID, 0, prev_page_no, RW_X_LATCH, mtr); + IBUF_SPACE_ID, 0, prev_page_no, RW_X_LATCH, + mtr); buf_block_dbg_add_level(block, SYNC_TREE_NODE); prev_page = buf_block_get_frame(block); + ut_ad(page_validate(prev_page, ibuf->index)); } #ifdef UNIV_BTR_DEBUG - ut_a(btr_page_get_next(prev_page, mtr) - == page_get_page_no(page)); + ut_a(btr_page_get_next(prev_page, mtr) == page_get_page_no(page)); #endif /* UNIV_BTR_DEBUG */ rec = page_get_supremum_rec(prev_page); - rec = page_rec_get_prev(rec); + rec = page_rec_get_prev_const(rec); + + for (;; rec = page_rec_get_prev_const(rec)) { + ut_ad(page_align(rec) == prev_page); - for (;;) { if (page_rec_is_infimum(rec)) { /* We cannot go to yet a previous page, because we @@ -2431,39 +3092,35 @@ ibuf_get_volume_buffered( return(UNIV_PAGE_SIZE); } - if (page_no != ibuf_rec_get_page_no(rec) - || space != ibuf_rec_get_space(rec)) { + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { goto count_later; } - volume += ibuf_rec_get_volume(rec); - - rec = page_rec_get_prev(rec); + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); } count_later: rec = btr_pcur_get_rec(pcur); if (!page_rec_is_supremum(rec)) { - rec = page_rec_get_next(rec); + rec = page_rec_get_next_const(rec); } - for (;;) { - if (page_rec_is_supremum(rec)) { - - break; - } - - if (page_no != ibuf_rec_get_page_no(rec) - || space != ibuf_rec_get_space(rec)) { + for (; !page_rec_is_supremum(rec); + rec = page_rec_get_next_const(rec)) { + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { return(volume); } - volume += ibuf_rec_get_volume(rec); - - rec = page_rec_get_next(rec); + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); } /* Look at the next page */ @@ -2479,12 +3136,14 @@ count_later: buf_block_t* block; block = buf_page_get( - IBUF_SPACE_ID, 0, next_page_no, RW_X_LATCH, mtr); + IBUF_SPACE_ID, 0, next_page_no, RW_X_LATCH, + mtr); buf_block_dbg_add_level(block, SYNC_TREE_NODE); next_page = buf_block_get_frame(block); + ut_ad(page_validate(next_page, ibuf->index)); } #ifdef UNIV_BTR_DEBUG @@ -2492,9 +3151,11 @@ count_later: #endif /* UNIV_BTR_DEBUG */ rec = page_get_infimum_rec(next_page); - rec = page_rec_get_next(rec); + rec = page_rec_get_next_const(rec); + + for (;; rec = page_rec_get_next_const(rec)) { + ut_ad(page_align(rec) == next_page); - for (;;) { if (page_rec_is_supremum(rec)) { /* We give up */ @@ -2502,15 +3163,15 @@ count_later: return(UNIV_PAGE_SIZE); } - if (page_no != ibuf_rec_get_page_no(rec) - || space != ibuf_rec_get_space(rec)) { + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { return(volume); } - volume += ibuf_rec_get_volume(rec); - - rec = page_rec_get_next(rec); + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); } } @@ -2531,13 +3192,13 @@ ibuf_update_max_tablespace_id(void) ut_a(!dict_table_is_comp(ibuf->index->table)); - ibuf_enter(); - - mtr_start(&mtr); + ibuf_mtr_start(&mtr); btr_pcur_open_at_index_side( FALSE, ibuf->index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); + ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index)); + btr_pcur_move_to_prev(&pcur, &mtr); if (btr_pcur_is_before_first_on_page(&pcur)) { @@ -2554,23 +3215,248 @@ ibuf_update_max_tablespace_id(void) max_space_id = mach_read_from_4(field); } - mtr_commit(&mtr); - ibuf_exit(); + ibuf_mtr_commit(&mtr); /* printf("Maximum space id in insert buffer %lu\n", max_space_id); */ fil_set_max_space_id_if_bigger(max_space_id); } +#ifdef UNIV_DEBUG +# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \ + ibuf_get_entry_counter_low_func(mtr,rec,space,page_no) +#else /* UNIV_DEBUG */ +# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \ + ibuf_get_entry_counter_low_func(rec,space,page_no) +#endif +/****************************************************************//** +Helper function for ibuf_set_entry_counter. Checks if rec is for (space, +page_no), and if so, reads counter value from it and returns that + 1. +Otherwise, returns 0. +@return new counter value, or 0 */ +static +ulint +ibuf_get_entry_counter_low_func( +/*============================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction of rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec, /*!< in: insert buffer record */ + ulint space, /*!< in: space id */ + ulint page_no) /*!< in: page number */ +{ + ulint counter; + const byte* field; + ulint len; + + ut_ad(ibuf_inside(mtr)); + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX)); + ut_ad(rec_get_n_fields_old(rec) > 2); + + field = rec_get_nth_field_old(rec, 1, &len); + + if (UNIV_UNLIKELY(len != 1)) { + /* pre-4.1 format */ + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + + return(ULINT_UNDEFINED); + } + + ut_a(trx_sys_multiple_tablespace_format); + + /* Check the tablespace identifier. */ + field = rec_get_nth_field_old(rec, 0, &len); + ut_a(len == 4); + + if (mach_read_from_4(field) != space) { + + return(0); + } + + /* Check the page offset. */ + field = rec_get_nth_field_old(rec, 2, &len); + ut_a(len == 4); + + if (mach_read_from_4(field) != page_no) { + + return(0); + } + + /* Check if the record contains a counter field. */ + field = rec_get_nth_field_old(rec, 3, &len); + + switch (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) { + default: + ut_error; + case 0: /* ROW_FORMAT=REDUNDANT */ + case 1: /* ROW_FORMAT=COMPACT */ + return(ULINT_UNDEFINED); + + case IBUF_REC_INFO_SIZE: + counter = mach_read_from_2(field + IBUF_REC_OFFSET_COUNTER); + ut_a(counter < 0xFFFF); + return(counter + 1); + } +} + +/****************************************************************//** +Set the counter field in entry to the correct value based on the current +last record in ibuf for (space, page_no). +@return FALSE if we should abort this insertion to ibuf */ +static +ibool +ibuf_set_entry_counter( +/*===================*/ + dtuple_t* entry, /*!< in/out: entry to patch */ + ulint space, /*!< in: space id of entry */ + ulint page_no, /*!< in: page number of entry */ + btr_pcur_t* pcur, /*!< in: pcur positioned on the record + found by btr_pcur_open(.., entry, + PAGE_CUR_LE, ..., pcur, ...) */ + ibool is_optimistic, /*!< in: is this an optimistic insert */ + mtr_t* mtr) /*!< in: mtr */ +{ + dfield_t* field; + byte* data; + ulint counter = 0; + + /* pcur points to either a user rec or to a page's infimum record. */ + ut_ad(ibuf_inside(mtr)); + ut_ad(mtr_memo_contains(mtr, btr_pcur_get_block(pcur), + MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_validate(btr_pcur_get_page(pcur), ibuf->index)); + + if (btr_pcur_is_on_user_rec(pcur)) { + + counter = ibuf_get_entry_counter_low( + mtr, btr_pcur_get_rec(pcur), space, page_no); + + if (UNIV_UNLIKELY(counter == ULINT_UNDEFINED)) { + /* The record lacks a counter field. + Such old records must be merged before + new records can be buffered. */ + + return(FALSE); + } + } else if (btr_pcur_is_before_first_in_tree(pcur, mtr)) { + /* Ibuf tree is either completely empty, or the insert + position is at the very first record of a non-empty tree. In + either case we have no previous records for (space, + page_no). */ + + counter = 0; + } else if (btr_pcur_is_before_first_on_page(pcur)) { + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + + if (cursor->low_match < 3) { + /* If low_match < 3, we know that the father node + pointer did not contain the searched for (space, + page_no), which means that the search ended on the + right page regardless of the counter value, and + since we're at the infimum record, there are no + existing records. */ + + counter = 0; + } else { + rec_t* rec; + const page_t* page; + buf_block_t* block; + page_t* prev_page; + ulint prev_page_no; + + ut_a(cursor->ibuf_cnt != ULINT_UNDEFINED); + + page = btr_pcur_get_page(pcur); + prev_page_no = btr_page_get_prev(page, mtr); + + ut_a(prev_page_no != FIL_NULL); + + block = buf_page_get( + IBUF_SPACE_ID, 0, prev_page_no, + RW_X_LATCH, mtr); + + buf_block_dbg_add_level(block, SYNC_TREE_NODE); + + prev_page = buf_block_get_frame(block); + + rec = page_rec_get_prev( + page_get_supremum_rec(prev_page)); + + ut_ad(page_rec_is_user_rec(rec)); + + counter = ibuf_get_entry_counter_low( + mtr, rec, space, page_no); + + if (UNIV_UNLIKELY(counter == ULINT_UNDEFINED)) { + /* The record lacks a counter field. + Such old records must be merged before + new records can be buffered. */ + + return(FALSE); + } + + if (counter < cursor->ibuf_cnt) { + /* Search ended on the wrong page. */ + + if (is_optimistic) { + /* In an optimistic insert, we can + shift the insert position to the left + page, since it only needs an X-latch + on the page itself, which the + original search acquired for us. */ + + btr_cur_position( + ibuf->index, rec, block, + btr_pcur_get_btr_cur(pcur)); + } else { + /* We can't shift the insert + position to the left page in a + pessimistic insert since it would + require an X-latch on the left + page's left page, so we have to + abort. */ + + return(FALSE); + } + } else { + /* The counter field in the father node is + the same as we would insert; we don't know + whether the insert should go to this page or + the left page (the later fields can differ), + so refuse the insert. */ + + return(FALSE); + } + } + } else { + /* The cursor is not positioned at or before a user record. */ + return(FALSE); + } + + /* Patch counter value in already built entry. */ + field = dtuple_get_nth_field(entry, 3); + data = dfield_get_data(field); + + mach_write_to_2(data + IBUF_REC_OFFSET_COUNTER, counter); + + return(TRUE); +} + /*********************************************************************//** -Makes an index insert to the insert buffer, instead of directly to the disk -page, if this is possible. -@return DB_SUCCESS, DB_FAIL, DB_STRONG_FAIL */ +Buffer an operation in the insert/delete buffer, instead of doing it +directly to the disk page, if this is possible. +@return DB_SUCCESS, DB_STRONG_FAIL or other error */ static ulint ibuf_insert_low( /*============*/ ulint mode, /*!< in: BTR_MODIFY_PREV or BTR_MODIFY_TREE */ + ibuf_op_t op, /*!< in: operation type */ + ibool no_counter, + /*!< in: TRUE=use 5.0.3 format; + FALSE=allow delete buffering */ const dtuple_t* entry, /*!< in: index entry to insert */ ulint entry_size, /*!< in: rec_get_converted_size(index, entry) */ @@ -2587,9 +3473,11 @@ ibuf_insert_low( dtuple_t* ibuf_entry; mem_heap_t* heap; ulint buffered; + lint min_n_recs; rec_t* ins_rec; ibool old_bit_value; page_t* bitmap_page; + buf_block_t* block; page_t* root; ulint err; ibool do_merge; @@ -2597,25 +3485,30 @@ ibuf_insert_low( ib_int64_t space_versions[IBUF_MAX_N_PAGES_MERGED]; ulint page_nos[IBUF_MAX_N_PAGES_MERGED]; ulint n_stored; - ulint bits; mtr_t mtr; mtr_t bitmap_mtr; ut_a(!dict_index_is_clust(index)); ut_ad(dtuple_check_typed(entry)); ut_ad(ut_is_2pow(zip_size)); + ut_ad(!no_counter || op == IBUF_OP_INSERT); + ut_a(op < IBUF_OP_COUNT); ut_a(trx_sys_multiple_tablespace_format); do_merge = FALSE; - mutex_enter(&ibuf_mutex); - + /* Perform dirty reads of ibuf->size and ibuf->max_size, to + reduce ibuf_mutex contention. ibuf->max_size remains constant + after ibuf_init_at_db_start(), but ibuf->size should be + protected by ibuf_mutex. Given that ibuf->size fits in a + machine word, this should be OK; at worst we are doing some + excessive ibuf_contract() or occasionally skipping a + ibuf_contract(). */ if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_DO_NOT_INSERT) { /* Insert buffer is now too big, contract it but do not try to insert */ - mutex_exit(&ibuf_mutex); #ifdef UNIV_IBUF_DEBUG fputs("Ibuf too big\n", stderr); @@ -2626,64 +3519,101 @@ ibuf_insert_low( return(DB_STRONG_FAIL); } - mutex_exit(&ibuf_mutex); + heap = mem_heap_create(512); - if (mode == BTR_MODIFY_TREE) { - mutex_enter(&ibuf_pessimistic_insert_mutex); + /* Build the entry which contains the space id and the page number + as the first fields and the type information for other fields, and + which will be inserted to the insert buffer. Using a counter value + of 0xFFFF we find the last record for (space, page_no), from which + we can then read the counter value N and use N + 1 in the record we + insert. (We patch the ibuf_entry's counter field to the correct + value just before actually inserting the entry.) */ - ibuf_enter(); + ibuf_entry = ibuf_entry_build( + op, index, entry, space, page_no, + no_counter ? ULINT_UNDEFINED : 0xFFFF, heap); - mutex_enter(&ibuf_mutex); + /* Open a cursor to the insert buffer tree to calculate if we can add + the new entry to it without exceeding the free space limit for the + page. */ - while (!ibuf_data_enough_free_for_insert()) { + if (mode == BTR_MODIFY_TREE) { + for (;;) { + mutex_enter(&ibuf_pessimistic_insert_mutex); + mutex_enter(&ibuf_mutex); - mutex_exit(&ibuf_mutex); + if (UNIV_LIKELY(ibuf_data_enough_free_for_insert())) { - ibuf_exit(); + break; + } + mutex_exit(&ibuf_mutex); mutex_exit(&ibuf_pessimistic_insert_mutex); - err = ibuf_add_free_page(); - - if (err == DB_STRONG_FAIL) { + if (UNIV_UNLIKELY(!ibuf_add_free_page())) { - return(err); + mem_heap_free(heap); + return(DB_STRONG_FAIL); } - - mutex_enter(&ibuf_pessimistic_insert_mutex); - - ibuf_enter(); - - mutex_enter(&ibuf_mutex); } - } else { - ibuf_enter(); } - heap = mem_heap_create(512); - - /* Build the entry which contains the space id and the page number as - the first fields and the type information for other fields, and which - will be inserted to the insert buffer. */ - - ibuf_entry = ibuf_entry_build(index, entry, space, page_no, heap); - - /* Open a cursor to the insert buffer tree to calculate if we can add - the new entry to it without exceeding the free space limit for the - page. */ - - mtr_start(&mtr); + ibuf_mtr_start(&mtr); btr_pcur_open(ibuf->index, ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr); + ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index)); /* Find out the volume of already buffered inserts for the same index page */ - buffered = ibuf_get_volume_buffered(&pcur, space, page_no, &mtr); + min_n_recs = 0; + buffered = ibuf_get_volume_buffered(&pcur, space, page_no, + op == IBUF_OP_DELETE + ? &min_n_recs + : NULL, &mtr); + + if (op == IBUF_OP_DELETE + && (min_n_recs < 2 + || buf_pool_watch_occurred(space, page_no))) { + /* The page could become empty after the record is + deleted, or the page has been read in to the buffer + pool. Refuse to buffer the operation. */ + + /* The buffer pool watch is needed for IBUF_OP_DELETE + because of latching order considerations. We can + check buf_pool_watch_occurred() only after latching + the insert buffer B-tree pages that contain buffered + changes for the page. We never buffer IBUF_OP_DELETE, + unless some IBUF_OP_INSERT or IBUF_OP_DELETE_MARK have + been previously buffered for the page. Because there + are buffered operations for the page, the insert + buffer B-tree page latches held by mtr will guarantee + that no changes for the user page will be merged + before mtr_commit(&mtr). We must not mtr_commit(&mtr) + until after the IBUF_OP_DELETE has been buffered. */ + +fail_exit: + if (mode == BTR_MODIFY_TREE) { + mutex_exit(&ibuf_mutex); + mutex_exit(&ibuf_pessimistic_insert_mutex); + } + + err = DB_STRONG_FAIL; + goto func_exit; + } + + /* After this point, the page could still be loaded to the + buffer pool, but we do not have to care about it, since we are + holding a latch on the insert buffer leaf page that contains + buffered changes for (space, page_no). If the page enters the + buffer pool, buf_page_io_complete() for (space, page_no) will + have to acquire a latch on the same insert buffer leaf page, + which it cannot do until we have buffered the IBUF_OP_DELETE + and done mtr_commit(&mtr) to release the latch. */ #ifdef UNIV_IBUF_COUNT_DEBUG ut_a((buffered == 0) || ibuf_count_get(space, page_no)); #endif - mtr_start(&bitmap_mtr); + ibuf_mtr_start(&bitmap_mtr); bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, &bitmap_mtr); @@ -2692,29 +3622,42 @@ ibuf_insert_low( if (buf_page_peek(space, page_no) || lock_rec_expl_exist_on_page(space, page_no)) { - err = DB_STRONG_FAIL; - - mtr_commit(&bitmap_mtr); - goto function_exit; + goto bitmap_fail; } - bits = ibuf_bitmap_page_get_bits(bitmap_page, page_no, zip_size, - IBUF_BITMAP_FREE, &bitmap_mtr); + if (op == IBUF_OP_INSERT) { + ulint bits = ibuf_bitmap_page_get_bits( + bitmap_page, page_no, zip_size, IBUF_BITMAP_FREE, + &bitmap_mtr); - if (buffered + entry_size + page_dir_calc_reserved_space(1) - > ibuf_index_page_calc_free_from_bits(zip_size, bits)) { - mtr_commit(&bitmap_mtr); + if (buffered + entry_size + page_dir_calc_reserved_space(1) + > ibuf_index_page_calc_free_from_bits(zip_size, bits)) { + /* Release the bitmap page latch early. */ + ibuf_mtr_commit(&bitmap_mtr); - /* It may not fit */ - err = DB_STRONG_FAIL; + /* It may not fit */ + do_merge = TRUE; + + ibuf_get_merge_page_nos(FALSE, + btr_pcur_get_rec(&pcur), &mtr, + space_ids, space_versions, + page_nos, &n_stored); + + goto fail_exit; + } + } - do_merge = TRUE; + /* Patch correct counter value to the entry to insert. This can + change the insert position, which can result in the need to abort in + some cases. */ + if (!no_counter + && !ibuf_set_entry_counter(ibuf_entry, space, page_no, &pcur, + mode == BTR_MODIFY_PREV, &mtr)) { +bitmap_fail: + ibuf_mtr_commit(&bitmap_mtr); - ibuf_get_merge_page_nos(FALSE, btr_pcur_get_rec(&pcur), - space_ids, space_versions, - page_nos, &n_stored); - goto function_exit; + goto fail_exit; } /* Set the bitmap bit denoting that the insert buffer contains @@ -2730,7 +3673,7 @@ ibuf_insert_low( &bitmap_mtr); } - mtr_commit(&bitmap_mtr); + ibuf_mtr_commit(&bitmap_mtr); cursor = btr_pcur_get_btr_cur(&pcur); @@ -2738,10 +3681,19 @@ ibuf_insert_low( err = btr_cur_optimistic_insert(BTR_NO_LOCKING_FLAG, cursor, ibuf_entry, &ins_rec, &dummy_big_rec, 0, thr, &mtr); - if (err == DB_SUCCESS) { - /* Update the page max trx id field */ - page_update_max_trx_id(btr_cur_get_block(cursor), NULL, - thr_get_trx(thr)->id, &mtr); + block = btr_cur_get_block(cursor); + ut_ad(buf_block_get_space(block) == IBUF_SPACE_ID); + + /* If this is the root page, update ibuf->empty. */ + if (UNIV_UNLIKELY(buf_block_get_page_no(block) + == FSP_IBUF_TREE_ROOT_PAGE_NO)) { + const page_t* root = buf_block_get_frame(block); + + ut_ad(page_get_space_id(root) == IBUF_SPACE_ID); + ut_ad(page_get_page_no(root) + == FSP_IBUF_TREE_ROOT_PAGE_NO); + + ibuf->empty = (page_get_n_recs(root) == 0); } } else { ut_ad(mode == BTR_MODIFY_TREE); @@ -2758,16 +3710,22 @@ ibuf_insert_low( cursor, ibuf_entry, &ins_rec, &dummy_big_rec, 0, thr, &mtr); - if (err == DB_SUCCESS) { - /* Update the page max trx id field */ - page_update_max_trx_id(btr_cur_get_block(cursor), NULL, - thr_get_trx(thr)->id, &mtr); - } - + mutex_exit(&ibuf_pessimistic_insert_mutex); ibuf_size_update(root, &mtr); + mutex_exit(&ibuf_mutex); + ibuf->empty = (page_get_n_recs(root) == 0); + + block = btr_cur_get_block(cursor); + ut_ad(buf_block_get_space(block) == IBUF_SPACE_ID); } -function_exit: + if (err == DB_SUCCESS && op != IBUF_OP_DELETE) { + /* Update the page max trx id field */ + page_update_max_trx_id(block, NULL, + thr_get_trx(thr)->id, &mtr); + } + +func_exit: #ifdef UNIV_IBUF_COUNT_DEBUG if (err == DB_SUCCESS) { fprintf(stderr, @@ -2779,29 +3737,14 @@ function_exit: ibuf_count_get(space, page_no) + 1); } #endif - if (mode == BTR_MODIFY_TREE) { - - mutex_exit(&ibuf_mutex); - mutex_exit(&ibuf_pessimistic_insert_mutex); - } - mtr_commit(&mtr); + ibuf_mtr_commit(&mtr); btr_pcur_close(&pcur); - ibuf_exit(); mem_heap_free(heap); - if (err == DB_SUCCESS) { - mutex_enter(&ibuf_mutex); - - ibuf->empty = FALSE; - ibuf->n_inserts++; - - mutex_exit(&ibuf_mutex); - - if (mode == BTR_MODIFY_TREE) { - ibuf_contract_after_insert(entry_size); - } + if (err == DB_SUCCESS && mode == BTR_MODIFY_TREE) { + ibuf_contract_after_insert(entry_size); } if (do_merge) { @@ -2816,14 +3759,15 @@ function_exit: } /*********************************************************************//** -Makes an index insert to the insert buffer, instead of directly to the disk -page, if this is possible. Does not do insert if the index is clustered -or unique. +Buffer an operation in the insert/delete buffer, instead of doing it +directly to the disk page, if this is possible. Does not do it if the index +is clustered or unique. @return TRUE if success */ UNIV_INTERN ibool ibuf_insert( /*========*/ + ibuf_op_t op, /*!< in: operation type */ const dtuple_t* entry, /*!< in: index entry to insert */ dict_index_t* index, /*!< in: index where to insert */ ulint space, /*!< in: space id where to insert */ @@ -2831,8 +3775,12 @@ ibuf_insert( ulint page_no,/*!< in: page number where to insert */ que_thr_t* thr) /*!< in: query thread */ { - ulint err; - ulint entry_size; + ulint err; + ulint entry_size; + ibool no_counter; + /* Read the settable global variable ibuf_use only once in + this function, so that we will have a consistent view of it. */ + ibuf_use_t use = ibuf_use; ut_a(trx_sys_multiple_tablespace_format); ut_ad(dtuple_check_typed(entry)); @@ -2840,30 +3788,111 @@ ibuf_insert( ut_a(!dict_index_is_clust(index)); - switch (UNIV_EXPECT(ibuf_use, IBUF_USE_INSERT)) { - case IBUF_USE_NONE: - return(FALSE); - case IBUF_USE_INSERT: - goto do_insert; - case IBUF_USE_COUNT: + no_counter = use <= IBUF_USE_INSERT; + + switch (op) { + case IBUF_OP_INSERT: + switch (use) { + case IBUF_USE_NONE: + case IBUF_USE_DELETE: + case IBUF_USE_DELETE_MARK: + return(FALSE); + case IBUF_USE_INSERT: + case IBUF_USE_INSERT_DELETE_MARK: + case IBUF_USE_ALL: + goto check_watch; + case IBUF_USE_COUNT: + break; + } + break; + case IBUF_OP_DELETE_MARK: + switch (use) { + case IBUF_USE_NONE: + case IBUF_USE_INSERT: + return(FALSE); + case IBUF_USE_DELETE_MARK: + case IBUF_USE_DELETE: + case IBUF_USE_INSERT_DELETE_MARK: + case IBUF_USE_ALL: + ut_ad(!no_counter); + goto check_watch; + case IBUF_USE_COUNT: + break; + } + break; + case IBUF_OP_DELETE: + switch (use) { + case IBUF_USE_NONE: + case IBUF_USE_INSERT: + case IBUF_USE_INSERT_DELETE_MARK: + return(FALSE); + case IBUF_USE_DELETE_MARK: + case IBUF_USE_DELETE: + case IBUF_USE_ALL: + ut_ad(!no_counter); + goto skip_watch; + case IBUF_USE_COUNT: + break; + } + break; + case IBUF_OP_COUNT: break; } - ut_error; /* unknown value of ibuf_use */ + /* unknown op or use */ + ut_error; + +check_watch: + /* If a thread attempts to buffer an insert on a page while a + purge is in progress on the same page, the purge must not be + buffered, because it could remove a record that was + re-inserted later. For simplicity, we block the buffering of + all operations on a page that has a purge pending. + + We do not check this in the IBUF_OP_DELETE case, because that + would always trigger the buffer pool watch during purge and + thus prevent the buffering of delete operations. We assume + that the issuer of IBUF_OP_DELETE has called + buf_pool_watch_set(space, page_no). */ + + { + buf_page_t* bpage; + ulint fold = buf_page_address_fold(space, page_no); + buf_pool_t* buf_pool = buf_pool_get(space, page_no); + + //buf_pool_mutex_enter(buf_pool); + rw_lock_s_lock(&buf_pool->page_hash_latch); + bpage = buf_page_hash_get_low(buf_pool, space, page_no, fold); + //buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(&buf_pool->page_hash_latch); + + if (UNIV_LIKELY_NULL(bpage)) { + /* A buffer pool watch has been set or the + page has been read into the buffer pool. + Do not buffer the request. If a purge operation + is being buffered, have this request executed + directly on the page in the buffer pool after the + buffered entries for this page have been merged. */ + return(FALSE); + } + } -do_insert: +skip_watch: entry_size = rec_get_converted_size(index, entry, 0); if (entry_size - >= (page_get_free_space_of_empty(dict_table_is_comp(index->table)) - / 2)) { + >= page_get_free_space_of_empty(dict_table_is_comp(index->table)) + / 2) { + return(FALSE); } - err = ibuf_insert_low(BTR_MODIFY_PREV, entry, entry_size, + err = ibuf_insert_low(BTR_MODIFY_PREV, op, no_counter, + entry, entry_size, index, space, zip_size, page_no, thr); if (err == DB_FAIL) { - err = ibuf_insert_low(BTR_MODIFY_TREE, entry, entry_size, + err = ibuf_insert_low(BTR_MODIFY_TREE, op, no_counter, + entry, entry_size, index, space, zip_size, page_no, thr); } @@ -2886,9 +3915,80 @@ During merge, inserts to an index page a secondary index entry extracted from the insert buffer. */ static void +ibuf_insert_to_index_page_low( +/*==========================*/ + const dtuple_t* entry, /*!< in: buffered entry to insert */ + buf_block_t* block, /*!< in/out: index page where the buffered + entry should be placed */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr, /*!< in/out: mtr */ + page_cur_t* page_cur)/*!< in/out: cursor positioned on the record + after which to insert the buffered entry */ +{ + const page_t* page; + ulint space; + ulint page_no; + ulint zip_size; + const page_t* bitmap_page; + ulint old_bits; + + if (UNIV_LIKELY + (page_cur_tuple_insert(page_cur, entry, index, 0, mtr) != NULL)) { + return; + } + + /* If the record did not fit, reorganize */ + + btr_page_reorganize(block, index, mtr); + page_cur_search(block, index, entry, PAGE_CUR_LE, page_cur); + + /* This time the record must fit */ + + if (UNIV_LIKELY + (page_cur_tuple_insert(page_cur, entry, index, 0, mtr) != NULL)) { + return; + } + + page = buf_block_get_frame(block); + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: Insert buffer insert fails;" + " page free %lu, dtuple size %lu\n", + (ulong) page_get_max_insert_size(page, 1), + (ulong) rec_get_converted_size(index, entry, 0)); + fputs("InnoDB: Cannot insert index record ", stderr); + dtuple_print(stderr, entry); + fputs("\nInnoDB: The table where this index record belongs\n" + "InnoDB: is now probably corrupt. Please run CHECK TABLE on\n" + "InnoDB: that table.\n", stderr); + + space = page_get_space_id(page); + zip_size = buf_block_get_zip_size(block); + page_no = page_get_page_no(page); + + bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr); + old_bits = ibuf_bitmap_page_get_bits(bitmap_page, page_no, zip_size, + IBUF_BITMAP_FREE, mtr); + + fprintf(stderr, + "InnoDB: space %lu, page %lu, zip_size %lu, bitmap bits %lu\n", + (ulong) space, (ulong) page_no, + (ulong) zip_size, (ulong) old_bits); + + fputs("InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", stderr); +} + +/************************************************************************ +During merge, inserts to an index page a secondary index entry extracted +from the insert buffer. */ +static +void ibuf_insert_to_index_page( /*======================*/ - dtuple_t* entry, /*!< in: buffered entry to insert */ + const dtuple_t* entry, /*!< in: buffered entry to insert */ buf_block_t* block, /*!< in/out: index page where the buffered entry should be placed */ dict_index_t* index, /*!< in: record descriptor */ @@ -2898,11 +3998,10 @@ ibuf_insert_to_index_page( ulint low_match; page_t* page = buf_block_get_frame(block); rec_t* rec; - page_t* bitmap_page; - ulint old_bits; - ut_ad(ibuf_inside()); + ut_ad(ibuf_inside(mtr)); ut_ad(dtuple_check_typed(entry)); + ut_ad(!buf_block_align(page)->is_hashed); if (UNIV_UNLIKELY(dict_table_is_comp(index->table) != (ibool)!!page_is_comp(page))) { @@ -2915,6 +4014,14 @@ ibuf_insert_to_index_page( rec = page_rec_get_next(page_get_infimum_rec(page)); + if (page_rec_is_supremum(rec)) { + fputs("InnoDB: Trying to insert a record from" + " the insert buffer to an index page\n" + "InnoDB: but the index page is empty!\n", + stderr); + goto dump; + } + if (UNIV_UNLIKELY(rec_get_n_fields(rec, index) != dtuple_get_n_fields(entry))) { fputs("InnoDB: Trying to insert a record from" @@ -2940,72 +4047,280 @@ dump: low_match = page_cur_search(block, index, entry, PAGE_CUR_LE, &page_cur); - if (low_match == dtuple_get_n_fields(entry)) { + if (UNIV_UNLIKELY(low_match == dtuple_get_n_fields(entry))) { + mem_heap_t* heap; + upd_t* update; + ulint* offsets; page_zip_des_t* page_zip; rec = page_cur_get_rec(&page_cur); + + /* This is based on + row_ins_sec_index_entry_by_modify(BTR_MODIFY_LEAF). */ + ut_ad(rec_get_deleted_flag(rec, page_is_comp(page))); + + heap = mem_heap_create(1024); + + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, + &heap); + update = row_upd_build_sec_rec_difference_binary( + index, entry, rec, NULL, heap); + page_zip = buf_block_get_page_zip(block); - btr_cur_del_unmark_for_ibuf(rec, page_zip, mtr); + if (update->n_fields == 0) { + /* The records only differ in the delete-mark. + Clear the delete-mark, like we did before + Bug #56680 was fixed. */ + btr_cur_set_deleted_flag_for_ibuf( + rec, page_zip, FALSE, mtr); +updated_in_place: + mem_heap_free(heap); + return; + } + + /* Copy the info bits. Clear the delete-mark. */ + update->info_bits = rec_get_info_bits(rec, page_is_comp(page)); + update->info_bits &= ~REC_INFO_DELETED_FLAG; + + /* We cannot invoke btr_cur_optimistic_update() here, + because we do not have a btr_cur_t or que_thr_t, + as the insert buffer merge occurs at a very low level. */ + if (!row_upd_changes_field_size_or_external(index, offsets, + update) + && (!page_zip || btr_cur_update_alloc_zip( + page_zip, block, index, + rec_offs_size(offsets), FALSE, mtr))) { + /* This is the easy case. Do something similar + to btr_cur_update_in_place(). */ + row_upd_rec_in_place(rec, index, offsets, + update, page_zip); + goto updated_in_place; + } + + /* A collation may identify values that differ in + storage length. + Some examples (1 or 2 bytes): + utf8_turkish_ci: I = U+0131 LATIN SMALL LETTER DOTLESS I + utf8_general_ci: S = U+00DF LATIN SMALL LETTER SHARP S + utf8_general_ci: A = U+00E4 LATIN SMALL LETTER A WITH DIAERESIS + + latin1_german2_ci: SS = U+00DF LATIN SMALL LETTER SHARP S + + Examples of a character (3-byte UTF-8 sequence) + identified with 2 or 4 characters (1-byte UTF-8 sequences): + + utf8_unicode_ci: 'II' = U+2171 SMALL ROMAN NUMERAL TWO + utf8_unicode_ci: '(10)' = U+247D PARENTHESIZED NUMBER TEN + */ + + /* Delete the different-length record, and insert the + buffered one. */ + + lock_rec_store_on_page_infimum(block, rec); + page_cur_delete_rec(&page_cur, index, offsets, mtr); + page_cur_move_to_prev(&page_cur); + mem_heap_free(heap); + + ibuf_insert_to_index_page_low(entry, block, index, mtr, + &page_cur); + lock_rec_restore_from_page_infimum(block, rec, block); } else { - rec = page_cur_tuple_insert(&page_cur, entry, index, 0, mtr); + ibuf_insert_to_index_page_low(entry, block, index, mtr, + &page_cur); + } +} - if (UNIV_LIKELY(rec != NULL)) { - return; +/****************************************************************//** +During merge, sets the delete mark on a record for a secondary index +entry. */ +static +void +ibuf_set_del_mark( +/*==============*/ + const dtuple_t* entry, /*!< in: entry */ + buf_block_t* block, /*!< in/out: block */ + const dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t page_cur; + ulint low_match; + + ut_ad(ibuf_inside(mtr)); + ut_ad(dtuple_check_typed(entry)); + + low_match = page_cur_search( + block, index, entry, PAGE_CUR_LE, &page_cur); + + if (low_match == dtuple_get_n_fields(entry)) { + rec_t* rec; + page_zip_des_t* page_zip; + + rec = page_cur_get_rec(&page_cur); + page_zip = page_cur_get_page_zip(&page_cur); + + /* Delete mark the old index record. According to a + comment in row_upd_sec_index_entry(), it can already + have been delete marked if a lock wait occurred in + row_ins_index_entry() in a previous invocation of + row_upd_sec_index_entry(). */ + + if (UNIV_LIKELY + (!rec_get_deleted_flag( + rec, dict_table_is_comp(index->table)))) { + btr_cur_set_deleted_flag_for_ibuf(rec, page_zip, + TRUE, mtr); + } + } else { + ut_print_timestamp(stderr); + fputs(" InnoDB: unable to find a record to delete-mark\n", + stderr); + fputs("InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, page_cur_get_rec(&page_cur), index); + putc('\n', stderr); + fputs("\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", stderr); + ut_ad(0); + } +} + +/****************************************************************//** +During merge, delete a record for a secondary index entry. */ +static +void +ibuf_delete( +/*========*/ + const dtuple_t* entry, /*!< in: entry */ + buf_block_t* block, /*!< in/out: block */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in/out: mtr; must be committed + before latching any further pages */ +{ + page_cur_t page_cur; + ulint low_match; + + ut_ad(ibuf_inside(mtr)); + ut_ad(dtuple_check_typed(entry)); + + low_match = page_cur_search( + block, index, entry, PAGE_CUR_LE, &page_cur); + + if (low_match == dtuple_get_n_fields(entry)) { + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + page_t* page = buf_block_get_frame(block); + rec_t* rec = page_cur_get_rec(&page_cur); + + /* TODO: the below should probably be a separate function, + it's a bastardized version of btr_cur_optimistic_delete. */ + + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + mem_heap_t* heap = NULL; + ulint max_ins_size; + + rec_offs_init(offsets_); + + offsets = rec_get_offsets( + rec, index, offsets, ULINT_UNDEFINED, &heap); + + /* Refuse to delete the last record. */ + ut_a(page_get_n_recs(page) > 1); + + /* The record should have been marked for deletion. */ + ut_ad(REC_INFO_DELETED_FLAG + & rec_get_info_bits(rec, page_is_comp(page))); + + lock_update_delete(block, rec); + + if (!page_zip) { + max_ins_size + = page_get_max_insert_size_after_reorganize( + page, 1); + } +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + page_cur_delete_rec(&page_cur, index, offsets, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_zip) { + ibuf_update_free_bits_zip(block, mtr); + } else { + ibuf_update_free_bits_low(block, max_ins_size, mtr); } - /* If the record did not fit, reorganize */ + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } else { + /* The record must have been purged already. */ + } +} - btr_page_reorganize(block, index, mtr); - page_cur_search(block, index, entry, PAGE_CUR_LE, &page_cur); +/*********************************************************************//** +Restores insert buffer tree cursor position +@return TRUE if the position was restored; FALSE if not */ +static __attribute__((nonnull)) +ibool +ibuf_restore_pos( +/*=============*/ + ulint space, /*!< in: space id */ + ulint page_no,/*!< in: index page number where the record + should belong */ + const dtuple_t* search_tuple, + /*!< in: search tuple for entries of page_no */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ + btr_pcur_t* pcur, /*!< in/out: persistent cursor whose + position is to be restored */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(mode == BTR_MODIFY_LEAF || mode == BTR_MODIFY_TREE); - /* This time the record must fit */ - if (UNIV_UNLIKELY - (!page_cur_tuple_insert(&page_cur, entry, index, - 0, mtr))) { - ulint space; - ulint page_no; - ulint zip_size; + if (btr_pcur_restore_position(mode, pcur, mtr)) { - ut_print_timestamp(stderr); + return(TRUE); + } - fprintf(stderr, - " InnoDB: Error: Insert buffer insert" - " fails; page free %lu," - " dtuple size %lu\n", - (ulong) page_get_max_insert_size( - page, 1), - (ulong) rec_get_converted_size( - index, entry, 0)); - fputs("InnoDB: Cannot insert index record ", - stderr); - dtuple_print(stderr, entry); - fputs("\nInnoDB: The table where" - " this index record belongs\n" - "InnoDB: is now probably corrupt." - " Please run CHECK TABLE on\n" - "InnoDB: that table.\n", stderr); + if (fil_space_get_flags(space) == ULINT_UNDEFINED) { + /* The tablespace has been dropped. It is possible + that another thread has deleted the insert buffer + entry. Do not complain. */ + ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); + } else { + fprintf(stderr, + "InnoDB: ERROR: Submit the output to" + " http://bugs.mysql.com\n" + "InnoDB: ibuf cursor restoration fails!\n" + "InnoDB: ibuf record inserted to page %lu:%lu\n", + (ulong) space, (ulong) page_no); + fflush(stderr); - space = page_get_space_id(page); - zip_size = buf_block_get_zip_size(block); - page_no = page_get_page_no(page); + rec_print_old(stderr, btr_pcur_get_rec(pcur)); + rec_print_old(stderr, pcur->old_rec); + dtuple_print(stderr, search_tuple); - bitmap_page = ibuf_bitmap_get_map_page( - space, page_no, zip_size, mtr); - old_bits = ibuf_bitmap_page_get_bits( - bitmap_page, page_no, zip_size, - IBUF_BITMAP_FREE, mtr); + rec_print_old(stderr, + page_rec_get_next(btr_pcur_get_rec(pcur))); + fflush(stderr); - fprintf(stderr, - "InnoDB: space %lu, page %lu," - " zip_size %lu, bitmap bits %lu\n", - (ulong) space, (ulong) page_no, - (ulong) zip_size, (ulong) old_bits); + ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); - fputs("InnoDB: Submit a detailed bug report" - " to http://bugs.mysql.com\n", stderr); + fputs("InnoDB: Validating insert buffer tree:\n", stderr); + if (!btr_validate_index(ibuf->index, NULL)) { + ut_error; } + + fprintf(stderr, "InnoDB: ibuf tree ok\n"); + fflush(stderr); } + + return(FALSE); } /*********************************************************************//** @@ -3018,8 +4333,8 @@ ibool ibuf_delete_rec( /*============*/ ulint space, /*!< in: space id */ - ulint page_no,/*!< in: index page number where the record - should belong */ + ulint page_no,/*!< in: index page number that the record + should belong to */ btr_pcur_t* pcur, /*!< in: pcur positioned on the record to delete, having latch mode BTR_MODIFY_LEAF */ const dtuple_t* search_tuple, @@ -3030,14 +4345,30 @@ ibuf_delete_rec( page_t* root; ulint err; - ut_ad(ibuf_inside()); + ut_ad(ibuf_inside(mtr)); ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur))); - ut_ad(ibuf_rec_get_page_no(btr_pcur_get_rec(pcur)) == page_no); - ut_ad(ibuf_rec_get_space(btr_pcur_get_rec(pcur)) == space); + ut_ad(ibuf_rec_get_page_no(mtr, btr_pcur_get_rec(pcur)) == page_no); + ut_ad(ibuf_rec_get_space(mtr, btr_pcur_get_rec(pcur)) == space); success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur), mtr); if (success) { + if (UNIV_UNLIKELY(!page_get_n_recs(btr_pcur_get_page(pcur)))) { + /* If a B-tree page is empty, it must be the root page + and the whole B-tree must be empty. InnoDB does not + allow empty B-tree pages other than the root. */ + root = btr_pcur_get_page(pcur); + + ut_ad(page_get_space_id(root) == IBUF_SPACE_ID); + ut_ad(page_get_page_no(root) + == FSP_IBUF_TREE_ROOT_PAGE_NO); + + /* ibuf->empty is protected by the root page latch. + Before the deletion, it had to be FALSE. */ + ut_ad(!ibuf->empty); + ibuf->empty = TRUE; + } + #ifdef UNIV_IBUF_COUNT_DEBUG fprintf(stderr, "Decrementing ibuf count of space %lu page %lu\n" @@ -3050,54 +4381,22 @@ ibuf_delete_rec( } ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur))); - ut_ad(ibuf_rec_get_page_no(btr_pcur_get_rec(pcur)) == page_no); - ut_ad(ibuf_rec_get_space(btr_pcur_get_rec(pcur)) == space); + ut_ad(ibuf_rec_get_page_no(mtr, btr_pcur_get_rec(pcur)) == page_no); + ut_ad(ibuf_rec_get_space(mtr, btr_pcur_get_rec(pcur)) == space); /* We have to resort to a pessimistic delete from ibuf */ btr_pcur_store_position(pcur, mtr); + ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); - btr_pcur_commit_specify_mtr(pcur, mtr); - + ibuf_mtr_start(mtr); mutex_enter(&ibuf_mutex); - mtr_start(mtr); - - success = btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr); - - if (!success) { - if (fil_space_get_flags(space) == ULINT_UNDEFINED) { - /* The tablespace has been dropped. It is possible - that another thread has deleted the insert buffer - entry. Do not complain. */ - goto commit_and_exit; - } - - fprintf(stderr, - "InnoDB: ERROR: Submit the output to" - " http://bugs.mysql.com\n" - "InnoDB: ibuf cursor restoration fails!\n" - "InnoDB: ibuf record inserted to page %lu\n", - (ulong) page_no); - fflush(stderr); - - rec_print_old(stderr, btr_pcur_get_rec(pcur)); - rec_print_old(stderr, pcur->old_rec); - dtuple_print(stderr, search_tuple); - - rec_print_old(stderr, - page_rec_get_next(btr_pcur_get_rec(pcur))); - fflush(stderr); - - btr_pcur_commit_specify_mtr(pcur, mtr); - - fputs("InnoDB: Validating insert buffer tree:\n", stderr); - if (!btr_validate_index(ibuf->index, NULL)) { - ut_error; - } - - fprintf(stderr, "InnoDB: ibuf tree ok\n"); - fflush(stderr); + if (!ibuf_restore_pos(space, page_no, search_tuple, + BTR_MODIFY_TREE, pcur, mtr)) { + mutex_exit(&ibuf_mutex); + ut_ad(!ibuf_inside(mtr)); + ut_ad(mtr->state == MTR_COMMITTED); goto func_exit; } @@ -3111,25 +4410,26 @@ ibuf_delete_rec( ibuf_count_set(space, page_no, ibuf_count_get(space, page_no) - 1); #endif ibuf_size_update(root, mtr); + mutex_exit(&ibuf_mutex); -commit_and_exit: - btr_pcur_commit_specify_mtr(pcur, mtr); + ibuf->empty = (page_get_n_recs(root) == 0); + ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); func_exit: + ut_ad(!ibuf_inside(mtr)); + ut_ad(mtr->state == MTR_COMMITTED); btr_pcur_close(pcur); - mutex_exit(&ibuf_mutex); - return(TRUE); } /*********************************************************************//** When an index page is read from a disk to the buffer pool, this function -inserts to the page the possible index entries buffered in the insert buffer. -The entries are deleted from the insert buffer. If the page is not read, but -created in the buffer pool, this function deletes its buffered entries from -the insert buffer; there can exist entries for such a page if the page -belonged to an index which subsequently was dropped. */ +applies any buffered operations to the page and deletes the entries from the +insert buffer. If the page is not read, but created in the buffer pool, this +function deletes its buffered entries from the insert buffer; there can +exist entries for such a page if the page belonged to an index which +subsequently was dropped. */ UNIV_INTERN void ibuf_merge_or_delete_for_page( @@ -3150,15 +4450,18 @@ ibuf_merge_or_delete_for_page( mem_heap_t* heap; btr_pcur_t pcur; dtuple_t* search_tuple; - ulint n_inserts; #ifdef UNIV_IBUF_DEBUG - ulint volume; + ulint volume = 0; #endif page_zip_des_t* page_zip = NULL; ibool tablespace_being_deleted = FALSE; ibool corruption_noticed = FALSE; mtr_t mtr; + /* Counts for merged & discarded operations. */ + ulint mops[IBUF_OP_COUNT]; + ulint dops[IBUF_OP_COUNT]; + ut_ad(!block || buf_block_get_space(block) == space); ut_ad(!block || buf_block_get_page_no(block) == page_no); ut_ad(!block || buf_block_get_zip_size(block) == zip_size); @@ -3206,18 +4509,20 @@ ibuf_merge_or_delete_for_page( update_ibuf_bitmap = FALSE; } else { page_t* bitmap_page; + ulint bitmap_bits; - mtr_start(&mtr); + ibuf_mtr_start(&mtr); bitmap_page = ibuf_bitmap_get_map_page( space, page_no, zip_size, &mtr); + bitmap_bits = ibuf_bitmap_page_get_bits( + bitmap_page, page_no, zip_size, + IBUF_BITMAP_BUFFERED, &mtr); + + ibuf_mtr_commit(&mtr); - if (!ibuf_bitmap_page_get_bits(bitmap_page, page_no, - zip_size, - IBUF_BITMAP_BUFFERED, - &mtr)) { + if (!bitmap_bits) { /* No inserts buffered for this page */ - mtr_commit(&mtr); if (!tablespace_being_deleted) { fil_decr_pending_ibuf_merges(space); @@ -3225,7 +4530,6 @@ ibuf_merge_or_delete_for_page( return; } - mtr_commit(&mtr); } } else if (block && (ibuf_fixed_addr_page(space, zip_size, page_no) @@ -3234,11 +4538,9 @@ ibuf_merge_or_delete_for_page( return; } - ibuf_enter(); - heap = mem_heap_create(512); - if (!trx_sys_multiple_tablespace_format) { + if (UNIV_UNLIKELY(!trx_sys_multiple_tablespace_format)) { ut_a(trx_doublewrite_must_reset_space_ids); search_tuple = ibuf_search_tuple_build(space, page_no, heap); } else { @@ -3265,7 +4567,7 @@ ibuf_merge_or_delete_for_page( ut_print_timestamp(stderr); - mtr_start(&mtr); + ibuf_mtr_start(&mtr); fputs(" InnoDB: Dump of the ibuf bitmap page:\n", stderr); @@ -3273,8 +4575,7 @@ ibuf_merge_or_delete_for_page( bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, &mtr); buf_page_print(bitmap_page, 0); - - mtr_commit(&mtr); + ibuf_mtr_commit(&mtr); fputs("\nInnoDB: Dump of the page:\n", stderr); @@ -3301,12 +4602,11 @@ ibuf_merge_or_delete_for_page( } } - n_inserts = 0; -#ifdef UNIV_IBUF_DEBUG - volume = 0; -#endif + memset(mops, 0, sizeof(mops)); + memset(dops, 0, sizeof(dops)); + loop: - mtr_start(&mtr); + ibuf_mtr_start(&mtr); if (block) { ibool success; @@ -3340,8 +4640,8 @@ loop: rec = btr_pcur_get_rec(&pcur); /* Check if the entry is for this index page */ - if (ibuf_rec_get_page_no(rec) != page_no - || ibuf_rec_get_space(rec) != space) { + if (ibuf_rec_get_page_no(&mtr, rec) != page_no + || ibuf_rec_get_space(&mtr, rec) != space) { if (block) { page_header_reset_last_insert( @@ -3357,33 +4657,96 @@ loop: fputs("\nInnoDB: from the insert buffer!\n\n", stderr); } else if (block) { /* Now we have at pcur a record which should be - inserted to the index page; NOTE that the call below + applied on the index page; NOTE that the call below copies pointers to fields in rec, and we must keep the latch to the rec page until the insertion is finished! */ dtuple_t* entry; trx_id_t max_trx_id; dict_index_t* dummy_index; + ibuf_op_t op = ibuf_rec_get_op_type(&mtr, rec); max_trx_id = page_get_max_trx_id(page_align(rec)); page_update_max_trx_id(block, page_zip, max_trx_id, &mtr); + ut_ad(page_validate(page_align(rec), ibuf->index)); + entry = ibuf_build_entry_from_ibuf_rec( - rec, heap, &dummy_index); + &mtr, rec, heap, &dummy_index); + + ut_ad(page_validate(block->frame, dummy_index)); + + switch (op) { + ibool success; + case IBUF_OP_INSERT: #ifdef UNIV_IBUF_DEBUG - volume += rec_get_converted_size(dummy_index, entry, 0) - + page_dir_calc_reserved_space(1); - ut_a(volume <= 4 * UNIV_PAGE_SIZE - / IBUF_PAGE_SIZE_PER_FREE_SPACE); + volume += rec_get_converted_size( + dummy_index, entry, 0); + + volume += page_dir_calc_reserved_space(1); + + ut_a(volume <= 4 * UNIV_PAGE_SIZE + / IBUF_PAGE_SIZE_PER_FREE_SPACE); #endif - ibuf_insert_to_index_page(entry, block, - dummy_index, &mtr); + ibuf_insert_to_index_page( + entry, block, dummy_index, &mtr); + break; + + case IBUF_OP_DELETE_MARK: + ibuf_set_del_mark( + entry, block, dummy_index, &mtr); + break; + + case IBUF_OP_DELETE: + ibuf_delete(entry, block, dummy_index, &mtr); + /* Because ibuf_delete() will latch an + insert buffer bitmap page, commit mtr + before latching any further pages. + Store and restore the cursor position. */ + ut_ad(rec == btr_pcur_get_rec(&pcur)); + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(ibuf_rec_get_page_no(&mtr, rec) + == page_no); + ut_ad(ibuf_rec_get_space(&mtr, rec) == space); + + btr_pcur_store_position(&pcur, &mtr); + ibuf_btr_pcur_commit_specify_mtr(&pcur, &mtr); + + ibuf_mtr_start(&mtr); + + success = buf_page_get_known_nowait( + RW_X_LATCH, block, + BUF_KEEP_OLD, + __FILE__, __LINE__, &mtr); + ut_a(success); + + buf_block_dbg_add_level(block, SYNC_TREE_NODE); + + if (!ibuf_restore_pos(space, page_no, + search_tuple, + BTR_MODIFY_LEAF, + &pcur, &mtr)) { + + ut_ad(!ibuf_inside(&mtr)); + ut_ad(mtr.state == MTR_COMMITTED); + mops[op]++; + ibuf_dummy_index_free(dummy_index); + goto loop; + } + + break; + default: + ut_error; + } + + mops[op]++; + ibuf_dummy_index_free(dummy_index); + } else { + dops[ibuf_rec_get_op_type(&mtr, rec)]++; } - n_inserts++; - /* Delete the record from ibuf */ if (ibuf_delete_rec(space, page_no, &pcur, search_tuple, &mtr)) { @@ -3392,7 +4755,7 @@ loop: goto loop; } else if (btr_pcur_is_after_last_on_page(&pcur)) { - mtr_commit(&mtr); + ibuf_mtr_commit(&mtr); btr_pcur_close(&pcur); goto loop; @@ -3400,12 +4763,6 @@ loop: } reset_bit: -#ifdef UNIV_IBUF_COUNT_DEBUG - if (ibuf_count_get(space, page_no) > 0) { - /* btr_print_tree(ibuf_data->index->tree, 100); - ibuf_print(); */ - } -#endif if (UNIV_LIKELY(update_ibuf_bitmap)) { page_t* bitmap_page; @@ -3432,25 +4789,30 @@ reset_bit: } } - mtr_commit(&mtr); + ibuf_mtr_commit(&mtr); btr_pcur_close(&pcur); mem_heap_free(heap); +#ifdef HAVE_ATOMIC_BUILTINS + os_atomic_increment_ulint(&ibuf->n_merges, 1); + ibuf_add_ops(ibuf->n_merged_ops, mops); + ibuf_add_ops(ibuf->n_discarded_ops, dops); +#else /* HAVE_ATOMIC_BUILTINS */ /* Protect our statistics keeping from race conditions */ mutex_enter(&ibuf_mutex); ibuf->n_merges++; - ibuf->n_merged_recs += n_inserts; + ibuf_add_ops(ibuf->n_merged_ops, mops); + ibuf_add_ops(ibuf->n_discarded_ops, dops); mutex_exit(&ibuf_mutex); +#endif /* HAVE_ATOMIC_BUILTINS */ if (update_ibuf_bitmap && !tablespace_being_deleted) { fil_decr_pending_ibuf_merges(space); } - ibuf_exit(); - #ifdef UNIV_IBUF_COUNT_DEBUG ut_a(ibuf_count_get(space, page_no) == 0); #endif @@ -3470,12 +4832,13 @@ ibuf_delete_for_discarded_space( mem_heap_t* heap; btr_pcur_t pcur; dtuple_t* search_tuple; - rec_t* ibuf_rec; + const rec_t* ibuf_rec; ulint page_no; - ibool closed; - ulint n_inserts; mtr_t mtr; + /* Counts for discarded operations. */ + ulint dops[IBUF_OP_COUNT]; + heap = mem_heap_create(512); /* Use page number 0 to build the search tuple so that we get the @@ -3483,11 +4846,9 @@ ibuf_delete_for_discarded_space( search_tuple = ibuf_new_search_tuple_build(space, 0, heap); - n_inserts = 0; + memset(dops, 0, sizeof(dops)); loop: - ibuf_enter(); - - mtr_start(&mtr); + ibuf_mtr_start(&mtr); /* Position pcur in the insert buffer at the first entry for the space */ @@ -3507,50 +4868,44 @@ loop: ibuf_rec = btr_pcur_get_rec(&pcur); /* Check if the entry is for this space */ - if (ibuf_rec_get_space(ibuf_rec) != space) { + if (ibuf_rec_get_space(&mtr, ibuf_rec) != space) { goto leave_loop; } - page_no = ibuf_rec_get_page_no(ibuf_rec); + page_no = ibuf_rec_get_page_no(&mtr, ibuf_rec); - n_inserts++; + dops[ibuf_rec_get_op_type(&mtr, ibuf_rec)]++; /* Delete the record from ibuf */ - closed = ibuf_delete_rec(space, page_no, &pcur, search_tuple, - &mtr); - if (closed) { + if (ibuf_delete_rec(space, page_no, &pcur, search_tuple, + &mtr)) { /* Deletion was pessimistic and mtr was committed: we start from the beginning again */ - ibuf_exit(); - goto loop; } if (btr_pcur_is_after_last_on_page(&pcur)) { - mtr_commit(&mtr); + ibuf_mtr_commit(&mtr); btr_pcur_close(&pcur); - ibuf_exit(); - goto loop; } } leave_loop: - mtr_commit(&mtr); + ibuf_mtr_commit(&mtr); btr_pcur_close(&pcur); +#ifdef HAVE_ATOMIC_BUILTINS + ibuf_add_ops(ibuf->n_discarded_ops, dops); +#else /* HAVE_ATOMIC_BUILTINS */ /* Protect our statistics keeping from race conditions */ mutex_enter(&ibuf_mutex); - - ibuf->n_merges++; - ibuf->n_merged_recs += n_inserts; - + ibuf_add_ops(ibuf->n_discarded_ops, dops); mutex_exit(&ibuf_mutex); - - ibuf_exit(); +#endif /* HAVE_ATOMIC_BUILTINS */ mem_heap_free(heap); } @@ -3567,37 +4922,15 @@ ibuf_is_empty(void) const page_t* root; mtr_t mtr; - ibuf_enter(); + ibuf_mtr_start(&mtr); mutex_enter(&ibuf_mutex); - - mtr_start(&mtr); - root = ibuf_tree_root_get(&mtr); - - if (page_get_n_recs(root) == 0) { - - is_empty = TRUE; - - if (ibuf->empty == FALSE) { - fprintf(stderr, - "InnoDB: Warning: insert buffer tree is empty" - " but the data struct does not\n" - "InnoDB: know it. This condition is legal" - " if the master thread has not yet\n" - "InnoDB: run to completion.\n"); - } - } else { - ut_a(ibuf->empty == FALSE); - - is_empty = FALSE; - } - - mtr_commit(&mtr); - mutex_exit(&ibuf_mutex); - ibuf_exit(); + is_empty = (page_get_n_recs(root) == 0); + ut_a(is_empty == ibuf->empty); + ibuf_mtr_commit(&mtr); return(is_empty); } @@ -3618,14 +4951,19 @@ ibuf_print( mutex_enter(&ibuf_mutex); fprintf(file, - "Ibuf: size %lu, free list len %lu, seg size %lu,\n" - "%lu inserts, %lu merged recs, %lu merges\n", + "Ibuf: size %lu, free list len %lu," + " seg size %lu, %lu merges\n", (ulong) ibuf->size, (ulong) ibuf->free_list_len, (ulong) ibuf->seg_size, - (ulong) ibuf->n_inserts, - (ulong) ibuf->n_merged_recs, (ulong) ibuf->n_merges); + + fputs("merged operations:\n ", file); + ibuf_print_ops(ibuf->n_merged_ops, file); + + fputs("discarded operations:\n ", file); + ibuf_print_ops(ibuf->n_discarded_ops, file); + #ifdef UNIV_IBUF_COUNT_DEBUG for (i = 0; i < IBUF_COUNT_N_SPACES; i++) { for (j = 0; j < IBUF_COUNT_N_PAGES; j++) { |