diff options
author | Marko Mäkelä <marko.makela@oracle.com> | 2011-10-26 11:44:28 +0300 |
---|---|---|
committer | Marko Mäkelä <marko.makela@oracle.com> | 2011-10-26 11:44:28 +0300 |
commit | b36da66bae6cdbb60bbfdd26da7403febce370eb (patch) | |
tree | 644aa74ed246e7c4530392e1c9e24800dbfdfdc9 /storage | |
parent | e27623a76ceaa372caaffb8c7faac503e1ed5035 (diff) | |
download | mariadb-git-b36da66bae6cdbb60bbfdd26da7403febce370eb.tar.gz |
Revert most of revno 3560.9.1 (Bug#12704861)
This was an attempt to address problems with the Bug#12612184 fix.
Even with this follow-up fix, crash recovery can be broken.
Let us fix the bug later.
Diffstat (limited to 'storage')
27 files changed, 415 insertions, 972 deletions
diff --git a/storage/innobase/btr/btr0btr.c b/storage/innobase/btr/btr0btr.c index ad99913cf3b..e8e065a3116 100644 --- a/storage/innobase/btr/btr0btr.c +++ b/storage/innobase/btr/btr0btr.c @@ -300,30 +300,29 @@ btr_page_alloc_for_ibuf( /****************************************************************** Allocates a new file page to be used in an index tree. NOTE: we assume that the caller has made the reservation for free extents! */ -static -ulint -btr_page_alloc_low( -/*===============*/ - /* out: allocated page number, - FIL_NULL if out of space */ + +page_t* +btr_page_alloc( +/*===========*/ + /* out: new allocated page, x-latched; + NULL if out of space */ dict_index_t* index, /* in: index */ ulint hint_page_no, /* in: hint of a good page */ byte file_direction, /* in: direction where a possible page split is made */ ulint level, /* in: level where the page is placed in the tree */ - mtr_t* mtr, /* in/out: mini-transaction - for the allocation */ - mtr_t* init_mtr) /* in/out: mini-transaction - in which the page should be - initialized (may be the same - as mtr), or NULL if it should - not be initialized (the page - at hint was previously freed - in mtr) */ + mtr_t* mtr) /* in: mtr */ { fseg_header_t* seg_header; page_t* root; + page_t* new_page; + ulint new_page_no; + + if (index->type & DICT_IBUF) { + + return(btr_page_alloc_for_ibuf(index, mtr)); + } root = btr_root_get(index, mtr); @@ -337,61 +336,19 @@ btr_page_alloc_low( reservation for free extents, and thus we know that a page can be allocated: */ - return(fseg_alloc_free_page_general(seg_header, hint_page_no, - file_direction, TRUE, - mtr, init_mtr)); -} - -/**************************************************************//** -Allocates a new file page to be used in an index tree. NOTE: we assume -that the caller has made the reservation for free extents! */ - -page_t* -btr_page_alloc( -/*===========*/ - /* out: new allocated block, x-latched; - NULL if out of space */ - dict_index_t* index, /* in: index */ - ulint hint_page_no, /* in: hint of a good page */ - byte file_direction, /* in: direction where a possible - page split is made */ - ulint level, /* in: level where the page is placed - in the tree */ - mtr_t* mtr, /* in/out: mini-transaction - for the allocation */ - mtr_t* init_mtr) /* in/out: mini-transaction - for x-latching and initializing - the page */ -{ - page_t* new_page; - ulint new_page_no; - - if (index->type & DICT_IBUF) { - - return(btr_page_alloc_for_ibuf(index, mtr)); - } - - new_page_no = btr_page_alloc_low( - index, hint_page_no, file_direction, level, mtr, init_mtr); - + new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no, + file_direction, TRUE, mtr); if (new_page_no == FIL_NULL) { return(NULL); } new_page = buf_page_get(dict_index_get_space(index), new_page_no, - RW_X_LATCH, init_mtr); + RW_X_LATCH, mtr); #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(new_page, SYNC_TREE_NODE_NEW); #endif /* UNIV_SYNC_DEBUG */ - if (mtr->freed_clust_leaf) { - mtr_memo_release(mtr, new_page, MTR_MEMO_FREE_CLUST_LEAF); - ut_ad(!mtr_memo_contains(mtr, buf_block_align(new_page), - MTR_MEMO_FREE_CLUST_LEAF)); - } - - ut_ad(btr_freed_leaves_validate(mtr)); return(new_page); } @@ -538,138 +495,8 @@ btr_page_free( level = btr_page_get_level(page, mtr); btr_page_free_low(index, page, level, mtr); - - /* The handling of MTR_MEMO_FREE_CLUST_LEAF assumes this. */ - ut_ad(mtr_memo_contains(mtr, buf_block_align(page), - MTR_MEMO_PAGE_X_FIX)); - - if (level == 0 && (index->type & DICT_CLUSTERED)) { - /* We may have to call btr_mark_freed_leaves() to - temporarily mark the block nonfree for invoking - btr_store_big_rec_extern_fields() after an - update. Remember that the block was freed. */ - mtr->freed_clust_leaf = TRUE; - mtr_memo_push(mtr, buf_block_align(page), - MTR_MEMO_FREE_CLUST_LEAF); - } - - ut_ad(btr_freed_leaves_validate(mtr)); } -/**************************************************************//** -Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free. -For invoking btr_store_big_rec_extern_fields() after an update, -we must temporarily mark freed clustered index pages allocated, so -that off-page columns will not be allocated from them. Between the -btr_store_big_rec_extern_fields() and mtr_commit() we have to -mark the pages free again, so that no pages will be leaked. */ - -void -btr_mark_freed_leaves( -/*==================*/ - dict_index_t* index, /* in/out: clustered index */ - mtr_t* mtr, /* in/out: mini-transaction */ - ibool nonfree)/* in: TRUE=mark nonfree, FALSE=mark freed */ -{ - /* This is loosely based on mtr_memo_release(). */ - - ulint offset; - - ut_ad(index->type & DICT_CLUSTERED); - ut_ad(mtr->magic_n == MTR_MAGIC_N); - ut_ad(mtr->state == MTR_ACTIVE); - - if (!mtr->freed_clust_leaf) { - return; - } - - offset = dyn_array_get_data_size(&mtr->memo); - - while (offset > 0) { - mtr_memo_slot_t* slot; - buf_block_t* block; - - offset -= sizeof *slot; - - slot = dyn_array_get_element(&mtr->memo, offset); - - if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) { - continue; - } - - /* Because btr_page_alloc() does invoke - mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all - blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the - memo must still be clustered index leaf tree pages. */ - block = slot->object; - ut_a(buf_block_get_space(block) - == dict_index_get_space(index)); - ut_a(fil_page_get_type(buf_block_get_frame(block)) - == FIL_PAGE_INDEX); - ut_a(btr_page_get_level(buf_block_get_frame(block), mtr) == 0); - - if (nonfree) { - /* Allocate the same page again. */ - ulint page_no; - page_no = btr_page_alloc_low( - index, buf_block_get_page_no(block), - FSP_NO_DIR, 0, mtr, NULL); - ut_a(page_no == buf_block_get_page_no(block)); - } else { - /* Assert that the page is allocated and free it. */ - btr_page_free_low(index, buf_block_get_frame(block), - 0, mtr); - } - } - - ut_ad(btr_freed_leaves_validate(mtr)); -} - -#ifdef UNIV_DEBUG -/**************************************************************//** -Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF. -See btr_mark_freed_leaves(). */ - -ibool -btr_freed_leaves_validate( -/*======================*/ - /* out: TRUE if valid */ - mtr_t* mtr) /* in: mini-transaction */ -{ - ulint offset; - - ut_ad(mtr->magic_n == MTR_MAGIC_N); - ut_ad(mtr->state == MTR_ACTIVE); - - offset = dyn_array_get_data_size(&mtr->memo); - - while (offset > 0) { - mtr_memo_slot_t* slot; - buf_block_t* block; - - offset -= sizeof *slot; - - slot = dyn_array_get_element(&mtr->memo, offset); - - if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) { - continue; - } - - ut_a(mtr->freed_clust_leaf); - /* Because btr_page_alloc() does invoke - mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all - blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the - memo must still be clustered index leaf tree pages. */ - block = slot->object; - ut_a(fil_page_get_type(buf_block_get_frame(block)) - == FIL_PAGE_INDEX); - ut_a(btr_page_get_level(buf_block_get_frame(block), mtr) == 0); - } - - return(TRUE); -} -#endif /* UNIV_DEBUG */ - /****************************************************************** Sets the child node file address in a node pointer. */ UNIV_INLINE @@ -1199,7 +1026,7 @@ btr_root_raise_and_insert( a node pointer to the new page, and then splitting the new page. */ new_page = btr_page_alloc(index, 0, FSP_NO_DIR, - btr_page_get_level(root, mtr), mtr, mtr); + btr_page_get_level(root, mtr), mtr); btr_page_create(new_page, index, mtr); @@ -1820,7 +1647,7 @@ func_start: /* 2. Allocate a new page to the index */ new_page = btr_page_alloc(cursor->index, hint_page_no, direction, - btr_page_get_level(page, mtr), mtr, mtr); + btr_page_get_level(page, mtr), mtr); btr_page_create(new_page, cursor->index, mtr); /* 3. Calculate the first record on the upper half-page, and the diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.c index 37bb3188785..95d87344e93 100644 --- a/storage/innobase/btr/btr0cur.c +++ b/storage/innobase/btr/btr0cur.c @@ -2051,6 +2051,43 @@ return_after_reservations: return(err); } +/***************************************************************** +Commits and restarts a mini-transaction so that it will retain an +x-lock on index->lock and the cursor page. */ + +void +btr_cur_mtr_commit_and_start( +/*=========================*/ + btr_cur_t* cursor, /* in: cursor */ + mtr_t* mtr) /* in/out: mini-transaction */ +{ + buf_block_t* block; + + block = buf_block_align(btr_cur_get_rec(cursor)); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + /* Keep the locks across the mtr_commit(mtr). */ + rw_lock_x_lock(dict_index_get_lock(cursor->index)); + rw_lock_x_lock(&block->lock); + mutex_enter(&block->mutex); +#ifdef UNIV_SYNC_DEBUG + buf_block_buf_fix_inc_debug(block, __FILE__, __LINE__); +#else + buf_block_buf_fix_inc(block); +#endif + mutex_exit(&block->mutex); + /* Write out the redo log. */ + mtr_commit(mtr); + mtr_start(mtr); + /* Reassociate the locks with the mini-transaction. + They will be released on mtr_commit(mtr). */ + mtr_memo_push(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK); + mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX); +} + /*==================== B-TREE DELETE MARK AND UNMARK ===============*/ /******************************************************************** @@ -3449,11 +3486,6 @@ btr_store_big_rec_extern_fields( this function returns */ big_rec_t* big_rec_vec, /* in: vector containing fields to be stored externally */ - mtr_t* alloc_mtr, /* in/out: in an insert, NULL; - in an update, local_mtr for - allocating BLOB pages and - updating BLOB pointers; alloc_mtr - must not have freed any leaf pages */ mtr_t* local_mtr __attribute__((unused))) /* in: mtr containing the latch to rec and to the tree */ @@ -3474,8 +3506,6 @@ btr_store_big_rec_extern_fields( ulint i; mtr_t mtr; - ut_ad(local_mtr); - ut_ad(!alloc_mtr || alloc_mtr == local_mtr); ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK)); @@ -3485,25 +3515,6 @@ btr_store_big_rec_extern_fields( space_id = buf_frame_get_space_id(rec); - if (alloc_mtr) { - /* Because alloc_mtr will be committed after - mtr, it is possible that the tablespace has been - extended when the B-tree record was updated or - inserted, or it will be extended while allocating - pages for big_rec. - - TODO: In mtr (not alloc_mtr), write a redo log record - about extending the tablespace to its current size, - and remember the current size. Whenever the tablespace - grows as pages are allocated, write further redo log - records to mtr. (Currently tablespace extension is not - covered by the redo log. If it were, the record would - only be written to alloc_mtr, which is committed after - mtr.) */ - } else { - alloc_mtr = &mtr; - } - /* We have to create a file segment to the tablespace for each field and put the pointer to the field in rec */ @@ -3530,7 +3541,7 @@ btr_store_big_rec_extern_fields( } page = btr_page_alloc(index, hint_page_no, - FSP_NO_DIR, 0, alloc_mtr, &mtr); + FSP_NO_DIR, 0, &mtr); if (page == NULL) { mtr_commit(&mtr); @@ -3584,42 +3595,37 @@ btr_store_big_rec_extern_fields( extern_len -= store_len; - if (alloc_mtr == &mtr) { #ifdef UNIV_SYNC_DEBUG - rec_page = + rec_page = #endif /* UNIV_SYNC_DEBUG */ - buf_page_get( - space_id, - buf_frame_get_page_no(data), - RW_X_LATCH, &mtr); + buf_page_get(space_id, + buf_frame_get_page_no(data), + RW_X_LATCH, &mtr); #ifdef UNIV_SYNC_DEBUG - buf_page_dbg_add_level( - rec_page, SYNC_NO_ORDER_CHECK); + buf_page_dbg_add_level(rec_page, SYNC_NO_ORDER_CHECK); #endif /* UNIV_SYNC_DEBUG */ - } - mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, 0, - MLOG_4BYTES, alloc_mtr); + MLOG_4BYTES, &mtr); mlog_write_ulint(data + local_len + BTR_EXTERN_LEN + 4, big_rec_vec->fields[i].len - extern_len, - MLOG_4BYTES, alloc_mtr); + MLOG_4BYTES, &mtr); if (prev_page_no == FIL_NULL) { mlog_write_ulint(data + local_len + BTR_EXTERN_SPACE_ID, space_id, - MLOG_4BYTES, alloc_mtr); + MLOG_4BYTES, &mtr); mlog_write_ulint(data + local_len + BTR_EXTERN_PAGE_NO, page_no, - MLOG_4BYTES, alloc_mtr); + MLOG_4BYTES, &mtr); mlog_write_ulint(data + local_len + BTR_EXTERN_OFFSET, FIL_PAGE_DATA, - MLOG_4BYTES, alloc_mtr); + MLOG_4BYTES, &mtr); /* Set the bit denoting that this field in rec is stored externally */ @@ -3627,7 +3633,7 @@ btr_store_big_rec_extern_fields( rec_set_nth_field_extern_bit( rec, index, big_rec_vec->fields[i].field_no, - TRUE, alloc_mtr); + TRUE, &mtr); } prev_page_no = page_no; diff --git a/storage/innobase/fsp/fsp0fsp.c b/storage/innobase/fsp/fsp0fsp.c index d5be8fca38f..90e6ad34a9a 100644 --- a/storage/innobase/fsp/fsp0fsp.c +++ b/storage/innobase/fsp/fsp0fsp.c @@ -300,12 +300,8 @@ fseg_alloc_free_page_low( inserted there in order, into which direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR */ - mtr_t* mtr, /* in/out: mini-transaction */ - mtr_t* init_mtr);/* in/out: mini-transaction in which the - page should be initialized - (may be the same as mtr), or NULL if it - should not be initialized (the page at hint - was previously freed in mtr) */ + mtr_t* mtr); /* in/out: mini-transaction */ + /************************************************************************** Reads the file space size stored in the header page. */ @@ -1375,43 +1371,6 @@ fsp_alloc_free_extent( return(descr); } -/**********************************************************************//** -Allocates a single free page from a space. */ -static __attribute__((nonnull)) -void -fsp_alloc_from_free_frag( -/*=====================*/ - fsp_header_t* header, /* in/out: tablespace header */ - xdes_t* descr, /* in/out: extent descriptor */ - ulint bit, /* in: slot to allocate in the extent */ - mtr_t* mtr) /* in/out: mini-transaction */ -{ - ulint frag_n_used; - - ut_ad(xdes_get_state(descr, mtr) == XDES_FREE_FRAG); - ut_a(xdes_get_bit(descr, XDES_FREE_BIT, bit, mtr)); - xdes_set_bit(descr, XDES_FREE_BIT, bit, FALSE, mtr); - - /* Update the FRAG_N_USED field */ - frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, - mtr); - frag_n_used++; - mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES, - mtr); - if (xdes_is_full(descr, mtr)) { - /* The fragment is full: move it to another list */ - flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, - mtr); - xdes_set_state(descr, XDES_FULL_FRAG, mtr); - - flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE, - mtr); - mlog_write_ulint(header + FSP_FRAG_N_USED, - frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES, - mtr); - } -} - /************************************************************************** Allocates a single free page from a space. The page is marked as used. */ static @@ -1422,22 +1381,19 @@ fsp_alloc_free_page( be allocated */ ulint space, /* in: space id */ ulint hint, /* in: hint of which page would be desirable */ - mtr_t* mtr, /* in/out: mini-transaction */ - mtr_t* init_mtr)/* in/out: mini-transaction in which the - page should be initialized - (may be the same as mtr) */ + mtr_t* mtr) /* in/out: mini-transaction */ { fsp_header_t* header; fil_addr_t first; xdes_t* descr; page_t* page; ulint free; + ulint frag_n_used; ulint page_no; ulint space_size; ibool success; ut_ad(mtr); - ut_ad(init_mtr); header = fsp_get_space_header(space, mtr); @@ -1517,21 +1473,40 @@ fsp_alloc_free_page( } } - fsp_alloc_from_free_frag(header, descr, free, mtr); + xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr); + + /* Update the FRAG_N_USED field */ + frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, + mtr); + frag_n_used++; + mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES, + mtr); + if (xdes_is_full(descr, mtr)) { + /* The fragment is full: move it to another list */ + flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, + mtr); + xdes_set_state(descr, XDES_FULL_FRAG, mtr); + + flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE, + mtr); + mlog_write_ulint(header + FSP_FRAG_N_USED, + frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES, + mtr); + } /* Initialize the allocated page to the buffer pool, so that it can be obtained immediately with buf_page_get without need for a disk read. */ - buf_page_create(space, page_no, init_mtr); + buf_page_create(space, page_no, mtr); - page = buf_page_get(space, page_no, RW_X_LATCH, init_mtr); + page = buf_page_get(space, page_no, RW_X_LATCH, mtr); #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(page, SYNC_FSP_PAGE); #endif /* UNIV_SYNC_DEBUG */ /* Prior contents of the page should be ignored */ - fsp_init_file_page(page, init_mtr); + fsp_init_file_page(page, mtr); return(page_no); } @@ -1750,7 +1725,7 @@ fsp_alloc_seg_inode_page( space = buf_frame_get_space_id(space_header); - page_no = fsp_alloc_free_page(space, 0, mtr, mtr); + page_no = fsp_alloc_free_page(space, 0, mtr); if (page_no == FIL_NULL) { @@ -2120,8 +2095,7 @@ fseg_create_general( } if (page == 0) { - page = fseg_alloc_free_page_low(space, - inode, 0, FSP_UP, mtr, mtr); + page = fseg_alloc_free_page_low(space, inode, 0, FSP_UP, mtr); if (page == FIL_NULL) { @@ -2365,12 +2339,7 @@ fseg_alloc_free_page_low( inserted there in order, into which direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR */ - mtr_t* mtr, /* in/out: mini-transaction */ - mtr_t* init_mtr)/* in/out: mini-transaction in which the - page should be initialized - (may be the same as mtr), or NULL if it - should not be initialized (the page at hint - was previously freed in mtr) */ + mtr_t* mtr) /* in/out: mini-transaction */ { fsp_header_t* space_header; ulint space_size; @@ -2382,6 +2351,7 @@ fseg_alloc_free_page_low( if could not be allocated */ xdes_t* ret_descr; /* the extent of the allocated page */ page_t* page; + ibool frag_page_allocated = FALSE; ibool success; ulint n; @@ -2402,8 +2372,6 @@ fseg_alloc_free_page_low( if (descr == NULL) { /* Hint outside space or too high above free limit: reset hint */ - ut_a(init_mtr); - /* The file space header page is always allocated. */ hint = 0; descr = xdes_get_descriptor(space, hint, mtr); } @@ -2415,20 +2383,15 @@ fseg_alloc_free_page_low( mtr), seg_id)) && (xdes_get_bit(descr, XDES_FREE_BIT, hint % FSP_EXTENT_SIZE, mtr) == TRUE)) { -take_hinted_page: + /* 1. We can take the hinted page =================================*/ ret_descr = descr; ret_page = hint; - /* Skip the check for extending the tablespace. If the - page hint were not within the size of the tablespace, - we would have got (descr == NULL) above and reset the hint. */ - goto got_hinted_page; /*-----------------------------------------------------------*/ - } else if (xdes_get_state(descr, mtr) == XDES_FREE - && (!init_mtr - || ((reserved - used < reserved / FSEG_FILLFACTOR) - && used >= FSEG_FRAG_LIMIT))) { + } else if ((xdes_get_state(descr, mtr) == XDES_FREE) + && ((reserved - used) < reserved / FSEG_FILLFACTOR) + && (used >= FSEG_FRAG_LIMIT)) { /* 2. We allocate the free extent from space and can take ========================================================= @@ -2446,20 +2409,8 @@ take_hinted_page: /* Try to fill the segment free list */ fseg_fill_free_list(seg_inode, space, hint + FSP_EXTENT_SIZE, mtr); - goto take_hinted_page; - /*-----------------------------------------------------------*/ - } else if (!init_mtr) { - ut_a(xdes_get_state(descr, mtr) == XDES_FREE_FRAG); - fsp_alloc_from_free_frag(space_header, descr, - hint % FSP_EXTENT_SIZE, mtr); ret_page = hint; - ret_descr = NULL; - - /* Put the page in the fragment page array of the segment */ - n = fseg_find_free_frag_page_slot(seg_inode, mtr); - ut_a(n != FIL_NULL); - fseg_set_nth_frag_page_no(seg_inode, n, ret_page, mtr); - goto got_hinted_page; + /*-----------------------------------------------------------*/ } else if ((direction != FSP_NO_DIR) && ((reserved - used) < reserved / FSEG_FILLFACTOR) && (used >= FSEG_FRAG_LIMIT) @@ -2517,9 +2468,11 @@ take_hinted_page: } else if (used < FSEG_FRAG_LIMIT) { /* 6. We allocate an individual page from the space ===================================================*/ - ret_page = fsp_alloc_free_page(space, hint, mtr, init_mtr); + ret_page = fsp_alloc_free_page(space, hint, mtr); ret_descr = NULL; + frag_page_allocated = TRUE; + if (ret_page != FIL_NULL) { /* Put the page in the fragment page array of the segment */ @@ -2529,10 +2482,6 @@ take_hinted_page: fseg_set_nth_frag_page_no(seg_inode, n, ret_page, mtr); } - - /* fsp_alloc_free_page() invoked fsp_init_file_page() - already. */ - return(ret_page); /*-----------------------------------------------------------*/ } else { /* 7. We allocate a new extent and take its first page @@ -2579,31 +2528,22 @@ take_hinted_page: } } -got_hinted_page: - { + if (!frag_page_allocated) { /* Initialize the allocated page to buffer pool, so that it can be obtained immediately with buf_page_get without need for a disk read */ - mtr_t* block_mtr = init_mtr ? init_mtr : mtr; - page = buf_page_create(space, ret_page, block_mtr); + page = buf_page_create(space, ret_page, mtr); - ut_a(page == buf_page_get(space, ret_page, RW_X_LATCH, - block_mtr)); + ut_a(page == buf_page_get(space, ret_page, RW_X_LATCH, mtr)); #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(page, SYNC_FSP_PAGE); #endif /* UNIV_SYNC_DEBUG */ - if (init_mtr) { - /* The prior contents of the page should be ignored */ - fsp_init_file_page(page, init_mtr); - } - } + /* The prior contents of the page should be ignored */ + fsp_init_file_page(page, mtr); - /* ret_descr == NULL if the block was allocated from free_frag - (XDES_FREE_FRAG) */ - if (ret_descr != NULL) { /* At this point we know the extent and the page offset. The extent is still in the appropriate list (FSEG_NOT_FULL or FSEG_FREE), and the page is not yet marked as used. */ @@ -2640,11 +2580,7 @@ fseg_alloc_free_page_general( with fsp_reserve_free_extents, then there is no need to do the check for this individual page */ - mtr_t* mtr, /* in/out: mini-transaction handle */ - mtr_t* init_mtr)/* in/out: mtr or another mini-transaction - in which the page should be initialized, - or NULL if this is a "fake allocation" of - a page that was previously freed in mtr */ + mtr_t* mtr) /* in/out: mini-transaction */ { fseg_inode_t* inode; ulint space; @@ -2682,8 +2618,7 @@ fseg_alloc_free_page_general( } page_no = fseg_alloc_free_page_low(buf_frame_get_space_id(inode), - inode, hint, direction, - mtr, init_mtr); + inode, hint, direction, mtr); if (!has_done_reservation) { fil_space_release_free_extents(space, n_reserved); } @@ -2711,7 +2646,7 @@ fseg_alloc_free_page( mtr_t* mtr) /* in: mtr handle */ { return(fseg_alloc_free_page_general(seg_header, hint, direction, - FALSE, mtr, mtr)); + FALSE, mtr)); } /************************************************************************** diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index 3988019589d..269fa355558 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -379,11 +379,7 @@ btr_page_alloc( page split is made */ ulint level, /* in: level where the page is placed in the tree */ - mtr_t* mtr, /* in/out: mini-transaction - for the allocation */ - mtr_t* init_mtr); /* in/out: mini-transaction - for x-latching and initializing - the page */ + mtr_t* mtr); /* in: mtr */ /****************************************************************** Frees a file page used in an index tree. NOTE: cannot free field external storage pages because the page must contain info on its level. */ @@ -406,31 +402,6 @@ btr_page_free_low( page_t* page, /* in: page to be freed, x-latched */ ulint level, /* in: page level */ mtr_t* mtr); /* in: mtr */ -/**************************************************************//** -Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free. -For invoking btr_store_big_rec_extern_fields() after an update, -we must temporarily mark freed clustered index pages allocated, so -that off-page columns will not be allocated from them. Between the -btr_store_big_rec_extern_fields() and mtr_commit() we have to -mark the pages free again, so that no pages will be leaked. */ - -void -btr_mark_freed_leaves( -/*==================*/ - dict_index_t* index, /* in/out: clustered index */ - mtr_t* mtr, /* in/out: mini-transaction */ - ibool nonfree);/* in: TRUE=mark nonfree, FALSE=mark freed */ -#ifdef UNIV_DEBUG -/**************************************************************//** -Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF. -See btr_mark_freed_leaves(). */ - -ibool -btr_freed_leaves_validate( -/*======================*/ - /* out: TRUE if valid */ - mtr_t* mtr); /* in: mini-transaction */ -#endif /* UNIV_DEBUG */ #ifdef UNIV_BTR_PRINT /***************************************************************** Prints size info of a B-tree. */ diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index c2bf84ef9cb..c068d8d3318 100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -252,6 +252,15 @@ btr_cur_pessimistic_update( updates */ que_thr_t* thr, /* in: query thread */ mtr_t* mtr); /* in: mtr */ +/***************************************************************** +Commits and restarts a mini-transaction so that it will retain an +x-lock on index->lock and the cursor page. */ + +void +btr_cur_mtr_commit_and_start( +/*=========================*/ + btr_cur_t* cursor, /* in: cursor */ + mtr_t* mtr); /* in/out: mini-transaction */ /*************************************************************** Marks a clustered index record deleted. Writes an undo log record to undo log on this delete marking. Writes in the trx id field the id @@ -462,11 +471,6 @@ btr_store_big_rec_extern_fields( this function returns */ big_rec_t* big_rec_vec, /* in: vector containing fields to be stored externally */ - mtr_t* alloc_mtr, /* in/out: in an insert, NULL; - in an update, local_mtr for - allocating BLOB pages and - updating BLOB pointers; alloc_mtr - must not have freed any leaf pages */ mtr_t* local_mtr); /* in: mtr containing the latch to rec and to the tree */ /*********************************************************************** diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h index 4c58d6075e6..b7322944189 100644 --- a/storage/innobase/include/fsp0fsp.h +++ b/storage/innobase/include/fsp0fsp.h @@ -179,11 +179,7 @@ fseg_alloc_free_page_general( with fsp_reserve_free_extents, then there is no need to do the check for this individual page */ - mtr_t* mtr, /* in/out: mini-transaction */ - mtr_t* init_mtr);/* in/out: mtr or another mini-transaction - in which the page should be initialized, - or NULL if this is a "fake allocation" of - a page that was previously freed in mtr */ + mtr_t* mtr); /* in/out: mini-transaction */ /************************************************************************** Reserves free pages from a tablespace. All mini-transactions which may use several pages from the tablespace should call this function beforehand diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index 58983063361..2b41fa0059a 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -36,8 +36,6 @@ first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ #define MTR_MEMO_MODIFY 54 #define MTR_MEMO_S_LOCK 55 #define MTR_MEMO_X_LOCK 56 -/* The mini-transaction freed a clustered index leaf page. */ -#define MTR_MEMO_FREE_CLUST_LEAF 57 /* Log item types: we have made them to be of the type 'byte' for the compiler to warn if val and type parameters are switched @@ -317,12 +315,9 @@ struct mtr_struct{ ulint state; /* MTR_ACTIVE, MTR_COMMITTING, MTR_COMMITTED */ dyn_array_t memo; /* memo stack for locks etc. */ dyn_array_t log; /* mini-transaction log */ - unsigned modifications:1; + ibool modifications; /* TRUE if the mtr made modifications to buffer pool pages */ - unsigned freed_clust_leaf:1; - /* TRUE if MTR_MEMO_FREE_CLUST_LEAF - was logged in the mini-transaction */ ulint n_log_recs; /* count of how many page initial log records have been written to the mtr log */ diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic index 6b4cacf0766..81eec3bfc92 100644 --- a/storage/innobase/include/mtr0mtr.ic +++ b/storage/innobase/include/mtr0mtr.ic @@ -26,7 +26,6 @@ mtr_start( mtr->log_mode = MTR_LOG_ALL; mtr->modifications = FALSE; - mtr->freed_clust_leaf = FALSE; mtr->n_log_recs = 0; #ifdef UNIV_DEBUG @@ -51,8 +50,7 @@ mtr_memo_push( ut_ad(object); ut_ad(type >= MTR_MEMO_PAGE_S_FIX); - ut_ad(type <= MTR_MEMO_FREE_CLUST_LEAF); - ut_ad(type != MTR_MEMO_FREE_CLUST_LEAF || mtr->freed_clust_leaf); + ut_ad(type <= MTR_MEMO_X_LOCK); ut_ad(mtr); ut_ad(mtr->magic_n == MTR_MAGIC_N); diff --git a/storage/innobase/mtr/mtr0mtr.c b/storage/innobase/mtr/mtr0mtr.c index 33b71f0766c..728c37ce564 100644 --- a/storage/innobase/mtr/mtr0mtr.c +++ b/storage/innobase/mtr/mtr0mtr.c @@ -53,13 +53,17 @@ mtr_memo_slot_release( buf_page_release((buf_block_t*)object, type, mtr); } else if (type == MTR_MEMO_S_LOCK) { rw_lock_s_unlock((rw_lock_t*)object); - } else if (type != MTR_MEMO_X_LOCK) { - ut_ad(type == MTR_MEMO_MODIFY - || type == MTR_MEMO_FREE_CLUST_LEAF); +#ifdef UNIV_DEBUG + } else if (type == MTR_MEMO_X_LOCK) { + rw_lock_x_unlock((rw_lock_t*)object); + } else { + ut_ad(type == MTR_MEMO_MODIFY); ut_ad(mtr_memo_contains(mtr, object, MTR_MEMO_PAGE_X_FIX)); +#else } else { rw_lock_x_unlock((rw_lock_t*)object); +#endif } } diff --git a/storage/innobase/row/row0ins.c b/storage/innobase/row/row0ins.c index f6e6c81534b..c3b8f54a3c5 100644 --- a/storage/innobase/row/row0ins.c +++ b/storage/innobase/row/row0ins.c @@ -2090,20 +2090,15 @@ row_ins_index_entry_low( if (big_rec) { ut_a(err == DB_SUCCESS); /* Write out the externally stored - columns, but allocate the pages and - write the pointers using the - mini-transaction of the record update. - If any pages were freed in the update, - temporarily mark them allocated so - that off-page columns will not - overwrite them. We must do this, - because we will write the redo log for - the BLOB writes before writing the - redo log for the record update. Thus, - redo log application at crash recovery - will see BLOBs being written to free pages. */ - - btr_mark_freed_leaves(index, &mtr, TRUE); + columns while still x-latching + index->lock and block->lock. We have + to mtr_commit(mtr) first, so that the + redo log will be written in the + correct order. Otherwise, we would run + into trouble on crash recovery if mtr + freed B-tree pages on which some of + the big_rec fields will be written. */ + btr_cur_mtr_commit_and_start(&cursor, &mtr); rec = btr_cur_get_rec(&cursor); offsets = rec_get_offsets(rec, index, offsets, @@ -2111,8 +2106,7 @@ row_ins_index_entry_low( &heap); err = btr_store_big_rec_extern_fields( - index, rec, offsets, big_rec, - &mtr, &mtr); + index, rec, offsets, big_rec, &mtr); /* If writing big_rec fails (for example, because of DB_OUT_OF_FILE_SPACE), the record will be corrupted. Even if @@ -2125,9 +2119,6 @@ row_ins_index_entry_low( undo log, and thus the record cannot be rolled back. */ ut_a(err == DB_SUCCESS); - /* Free the pages again - in order to avoid a leak. */ - btr_mark_freed_leaves(index, &mtr, FALSE); goto stored_big_rec; } } else { @@ -2175,8 +2166,7 @@ function_exit: ULINT_UNDEFINED, &heap); err = btr_store_big_rec_extern_fields(index, rec, - offsets, big_rec, - NULL, &mtr); + offsets, big_rec, &mtr); stored_big_rec: if (modify) { dtuple_big_rec_free(big_rec); diff --git a/storage/innobase/row/row0row.c b/storage/innobase/row/row0row.c index ccb3c1f7781..171039e34ac 100644 --- a/storage/innobase/row/row0row.c +++ b/storage/innobase/row/row0row.c @@ -212,27 +212,23 @@ row_build( } #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG - if (rec_offs_any_null_extern(rec, offsets)) { - /* This condition can occur during crash recovery - before trx_rollback_or_clean_all_without_sess() has - completed execution. - - This condition is possible if the server crashed - during an insert or update before - btr_store_big_rec_extern_fields() did mtr_commit() all - BLOB pointers to the clustered index record. - - If the record contains a null BLOB pointer, look up the - transaction that holds the implicit lock on this record, and - assert that it is active. (In this version of InnoDB, we - cannot assert that it was recovered, because there is no - trx->is_recovered field.) */ - - ut_a(trx_assert_active( - row_get_rec_trx_id(rec, index, offsets))); - ut_a(trx_undo_roll_ptr_is_insert( - row_get_rec_roll_ptr(rec, index, offsets))); - } + /* This condition can occur during crash recovery before + trx_rollback_or_clean_all_without_sess() has completed + execution. + + This condition is possible if the server crashed + during an insert or update before + btr_store_big_rec_extern_fields() did mtr_commit() all + BLOB pointers to the clustered index record. + + If the record contains a null BLOB pointer, look up the + transaction that holds the implicit lock on this record, and + assert that it is active. (In this version of InnoDB, we + cannot assert that it was recovered, because there is no + trx->is_recovered field.) */ + + ut_a(!rec_offs_any_null_extern(rec, offsets) + || trx_assert_active(row_get_rec_trx_id(rec, index, offsets))); #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ if (type != ROW_COPY_POINTERS) { diff --git a/storage/innobase/row/row0upd.c b/storage/innobase/row/row0upd.c index 58739edfd98..694b00ea265 100644 --- a/storage/innobase/row/row0upd.c +++ b/storage/innobase/row/row0upd.c @@ -1591,22 +1591,21 @@ row_upd_clust_rec( *offsets_ = (sizeof offsets_) / sizeof *offsets_; ut_a(err == DB_SUCCESS); - /* Write out the externally stored columns, but - allocate the pages and write the pointers using the - mini-transaction of the record update. If any pages - were freed in the update, temporarily mark them - allocated so that off-page columns will not overwrite - them. We must do this, because we write the redo log - for the BLOB writes before writing the redo log for - the record update. */ - - btr_mark_freed_leaves(index, mtr, TRUE); + /* Write out the externally stored columns while still + x-latching index->lock and block->lock. We have to + mtr_commit(mtr) first, so that the redo log will be + written in the correct order. Otherwise, we would run + into trouble on crash recovery if mtr freed B-tree + pages on which some of the big_rec fields will be + written. */ + btr_cur_mtr_commit_and_start(btr_cur, mtr); + rec = btr_cur_get_rec(btr_cur); err = btr_store_big_rec_extern_fields( index, rec, rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap), - big_rec, mtr, mtr); + big_rec, mtr); if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } @@ -1619,8 +1618,6 @@ row_upd_clust_rec( to the undo log, and thus the record cannot be rolled back. */ ut_a(err == DB_SUCCESS); - /* Free the pages again in order to avoid a leak. */ - btr_mark_freed_leaves(index, mtr, FALSE); } mtr_commit(mtr); diff --git a/storage/innobase/trx/trx0undo.c b/storage/innobase/trx/trx0undo.c index ce09862f317..329565943c8 100644 --- a/storage/innobase/trx/trx0undo.c +++ b/storage/innobase/trx/trx0undo.c @@ -864,7 +864,7 @@ trx_undo_add_page( page_no = fseg_alloc_free_page_general(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, undo->top_page_no + 1, FSP_UP, - TRUE, mtr, mtr); + TRUE, mtr); fil_space_release_free_extents(undo->space, n_reserved); diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index 4e6e2be615a..e6724eb08c2 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -50,15 +50,6 @@ * include/trx0undo.h, trx/trx0rec.c, trx/trx0undo.c: Fix Bug#12547647 UPDATE LOGGING COULD EXCEED LOG PAGE SIZE -2011-08-29 The InnoDB Team - - * btr/btr0btr.c, btr/btr0cur.c, fsp/fsp0fsp.c, - include/btr0btr.h, include/btr0cur.h, include/fsp0fsp.h, - include/mtr0mtr.h, include/mtr0mtr.ic, mtr/mtr0mtr.c, - row/row0ins.c, row/row0row.c, row/row0upd.c, trx/trx0undo.c: - Fix Bug#12704861 Corruption after a crash during BLOB update - and other regressions from the fix of Bug#12612184 - 2011-08-15 The InnoDB Team * btr/btr0btr.c, btr/btr0cur.c, btr/btr0pcur.c, btr/btr0sea.c, diff --git a/storage/innodb_plugin/btr/btr0btr.c b/storage/innodb_plugin/btr/btr0btr.c index 71e1599d19e..cb94ef08cd6 100644 --- a/storage/innodb_plugin/btr/btr0btr.c +++ b/storage/innodb_plugin/btr/btr0btr.c @@ -906,29 +906,28 @@ btr_page_alloc_for_ibuf( /**************************************************************//** Allocates a new file page to be used in an index tree. NOTE: we assume that the caller has made the reservation for free extents! -@return allocated page number, FIL_NULL if out of space */ -static __attribute__((nonnull(1,5), warn_unused_result)) -ulint -btr_page_alloc_low( -/*===============*/ +@return new allocated block, x-latched; NULL if out of space */ +UNIV_INTERN +buf_block_t* +btr_page_alloc( +/*===========*/ dict_index_t* index, /*!< in: index */ ulint hint_page_no, /*!< in: hint of a good page */ byte file_direction, /*!< in: direction where a possible page split is made */ ulint level, /*!< in: level where the page is placed in the tree */ - mtr_t* mtr, /*!< in/out: mini-transaction - for the allocation */ - mtr_t* init_mtr) /*!< in/out: mini-transaction - in which the page should be - initialized (may be the same - as mtr), or NULL if it should - not be initialized (the page - at hint was previously freed - in mtr) */ + mtr_t* mtr) /*!< in: mtr */ { fseg_header_t* seg_header; page_t* root; + buf_block_t* new_block; + ulint new_page_no; + + if (dict_index_is_ibuf(index)) { + + return(btr_page_alloc_for_ibuf(index, mtr)); + } root = btr_root_get(index, mtr); @@ -942,42 +941,8 @@ btr_page_alloc_low( reservation for free extents, and thus we know that a page can be allocated: */ - return(fseg_alloc_free_page_general( - seg_header, hint_page_no, file_direction, - TRUE, mtr, init_mtr)); -} - -/**************************************************************//** -Allocates a new file page to be used in an index tree. NOTE: we assume -that the caller has made the reservation for free extents! -@return new allocated block, x-latched; NULL if out of space */ -UNIV_INTERN -buf_block_t* -btr_page_alloc( -/*===========*/ - dict_index_t* index, /*!< in: index */ - ulint hint_page_no, /*!< in: hint of a good page */ - byte file_direction, /*!< in: direction where a possible - page split is made */ - ulint level, /*!< in: level where the page is placed - in the tree */ - mtr_t* mtr, /*!< in/out: mini-transaction - for the allocation */ - mtr_t* init_mtr) /*!< in/out: mini-transaction - for x-latching and initializing - the page */ -{ - buf_block_t* new_block; - ulint new_page_no; - - if (dict_index_is_ibuf(index)) { - - return(btr_page_alloc_for_ibuf(index, mtr)); - } - - new_page_no = btr_page_alloc_low( - index, hint_page_no, file_direction, level, mtr, init_mtr); - + new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no, + file_direction, TRUE, mtr); if (new_page_no == FIL_NULL) { return(NULL); @@ -985,16 +950,9 @@ btr_page_alloc( new_block = buf_page_get(dict_index_get_space(index), dict_table_zip_size(index->table), - new_page_no, RW_X_LATCH, init_mtr); + new_page_no, RW_X_LATCH, mtr); buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW); - if (mtr->freed_clust_leaf) { - mtr_memo_release(mtr, new_block, MTR_MEMO_FREE_CLUST_LEAF); - ut_ad(!mtr_memo_contains(mtr, new_block, - MTR_MEMO_FREE_CLUST_LEAF)); - } - - ut_ad(btr_freed_leaves_validate(mtr)); return(new_block); } @@ -1129,139 +1087,12 @@ btr_page_free( buf_block_t* block, /*!< in: block to be freed, x-latched */ mtr_t* mtr) /*!< in: mtr */ { - const page_t* page = buf_block_get_frame(block); - ulint level = btr_page_get_level(page, mtr); - - ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_INDEX); - btr_page_free_low(index, block, level, mtr); - - /* The handling of MTR_MEMO_FREE_CLUST_LEAF assumes this. */ - ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); - - if (level == 0 && dict_index_is_clust(index)) { - /* We may have to call btr_mark_freed_leaves() to - temporarily mark the block nonfree for invoking - btr_store_big_rec_extern_fields_func() after an - update. Remember that the block was freed. */ - mtr->freed_clust_leaf = TRUE; - mtr_memo_push(mtr, block, MTR_MEMO_FREE_CLUST_LEAF); - } - - ut_ad(btr_freed_leaves_validate(mtr)); -} - -/**************************************************************//** -Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free. -For invoking btr_store_big_rec_extern_fields() after an update, -we must temporarily mark freed clustered index pages allocated, so -that off-page columns will not be allocated from them. Between the -btr_store_big_rec_extern_fields() and mtr_commit() we have to -mark the pages free again, so that no pages will be leaked. */ -UNIV_INTERN -void -btr_mark_freed_leaves( -/*==================*/ - dict_index_t* index, /*!< in/out: clustered index */ - mtr_t* mtr, /*!< in/out: mini-transaction */ - ibool nonfree)/*!< in: TRUE=mark nonfree, FALSE=mark freed */ -{ - /* This is loosely based on mtr_memo_release(). */ - - ulint offset; - - ut_ad(dict_index_is_clust(index)); - ut_ad(mtr->magic_n == MTR_MAGIC_N); - ut_ad(mtr->state == MTR_ACTIVE); - - if (!mtr->freed_clust_leaf) { - return; - } - - offset = dyn_array_get_data_size(&mtr->memo); - - while (offset > 0) { - mtr_memo_slot_t* slot; - buf_block_t* block; - - offset -= sizeof *slot; - - slot = dyn_array_get_element(&mtr->memo, offset); - - if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) { - continue; - } - - /* Because btr_page_alloc() does invoke - mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all - blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the - memo must still be clustered index leaf tree pages. */ - block = slot->object; - ut_a(buf_block_get_space(block) - == dict_index_get_space(index)); - ut_a(fil_page_get_type(buf_block_get_frame(block)) - == FIL_PAGE_INDEX); - ut_a(page_is_leaf(buf_block_get_frame(block))); - - if (nonfree) { - /* Allocate the same page again. */ - ulint page_no; - page_no = btr_page_alloc_low( - index, buf_block_get_page_no(block), - FSP_NO_DIR, 0, mtr, NULL); - ut_a(page_no == buf_block_get_page_no(block)); - } else { - /* Assert that the page is allocated and free it. */ - btr_page_free_low(index, block, 0, mtr); - } - } - - ut_ad(btr_freed_leaves_validate(mtr)); -} - -#ifdef UNIV_DEBUG -/**************************************************************//** -Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF. -@see btr_mark_freed_leaves() -@return TRUE */ -UNIV_INTERN -ibool -btr_freed_leaves_validate( -/*======================*/ - mtr_t* mtr) /*!< in: mini-transaction */ -{ - ulint offset; - - ut_ad(mtr->magic_n == MTR_MAGIC_N); - ut_ad(mtr->state == MTR_ACTIVE); - - offset = dyn_array_get_data_size(&mtr->memo); - - while (offset > 0) { - const mtr_memo_slot_t* slot; - const buf_block_t* block; - - offset -= sizeof *slot; - - slot = dyn_array_get_element(&mtr->memo, offset); - - if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) { - continue; - } + ulint level; - ut_a(mtr->freed_clust_leaf); - /* Because btr_page_alloc() does invoke - mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all - blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the - memo must still be clustered index leaf tree pages. */ - block = slot->object; - ut_a(fil_page_get_type(buf_block_get_frame(block)) - == FIL_PAGE_INDEX); - ut_a(page_is_leaf(buf_block_get_frame(block))); - } + level = btr_page_get_level(buf_block_get_frame(block), mtr); - return(TRUE); + btr_page_free_low(index, block, level, mtr); } -#endif /* UNIV_DEBUG */ /**************************************************************//** Sets the child node file address in a node pointer. */ @@ -1984,7 +1815,7 @@ btr_root_raise_and_insert( level = btr_page_get_level(root, mtr); - new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr); + new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr); new_page = buf_block_get_frame(new_block); new_page_zip = buf_block_get_page_zip(new_block); ut_a(!new_page_zip == !root_page_zip); @@ -2720,7 +2551,7 @@ func_start: /* 2. Allocate a new page to the index */ new_block = btr_page_alloc(cursor->index, hint_page_no, direction, - btr_page_get_level(page, mtr), mtr, mtr); + btr_page_get_level(page, mtr), mtr); new_page = buf_block_get_frame(new_block); new_page_zip = buf_block_get_page_zip(new_block); btr_page_create(new_block, new_page_zip, cursor->index, diff --git a/storage/innodb_plugin/btr/btr0cur.c b/storage/innodb_plugin/btr/btr0cur.c index 1e603a6fc81..4c862f061c5 100644 --- a/storage/innodb_plugin/btr/btr0cur.c +++ b/storage/innodb_plugin/btr/btr0cur.c @@ -2421,6 +2421,39 @@ return_after_reservations: return(err); } +/**************************************************************//** +Commits and restarts a mini-transaction so that it will retain an +x-lock on index->lock and the cursor page. */ +UNIV_INTERN +void +btr_cur_mtr_commit_and_start( +/*=========================*/ + btr_cur_t* cursor, /*!< in: cursor */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + buf_block_t* block; + + block = btr_cur_get_block(cursor); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + /* Keep the locks across the mtr_commit(mtr). */ + rw_lock_x_lock(dict_index_get_lock(cursor->index)); + rw_lock_x_lock(&block->lock); + mutex_enter(&block->mutex); + buf_block_buf_fix_inc(block, __FILE__, __LINE__); + mutex_exit(&block->mutex); + /* Write out the redo log. */ + mtr_commit(mtr); + mtr_start(mtr); + /* Reassociate the locks with the mini-transaction. + They will be released on mtr_commit(mtr). */ + mtr_memo_push(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK); + mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX); +} + /*==================== B-TREE DELETE MARK AND UNMARK ===============*/ /****************************************************************//** @@ -3863,9 +3896,6 @@ btr_store_big_rec_extern_fields_func( the "external storage" flags in offsets will not correspond to rec when this function returns */ - const big_rec_t*big_rec_vec, /*!< in: vector containing fields - to be stored externally */ - #ifdef UNIV_DEBUG mtr_t* local_mtr, /*!< in: mtr containing the latch to rec and to the tree */ @@ -3874,11 +3904,9 @@ btr_store_big_rec_extern_fields_func( ibool update_in_place,/*! in: TRUE if the record is updated in place (not delete+insert) */ #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ - mtr_t* alloc_mtr) /*!< in/out: in an insert, NULL; - in an update, local_mtr for - allocating BLOB pages and - updating BLOB pointers; alloc_mtr - must not have freed any leaf pages */ + const big_rec_t*big_rec_vec) /*!< in: vector containing fields + to be stored externally */ + { ulint rec_page_no; byte* field_ref; @@ -3897,9 +3925,6 @@ btr_store_big_rec_extern_fields_func( ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(rec_offs_any_extern(offsets)); - ut_ad(local_mtr); - ut_ad(!alloc_mtr || alloc_mtr == local_mtr); - ut_ad(!update_in_place || alloc_mtr); ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK)); ut_ad(mtr_memo_contains(local_mtr, rec_block, MTR_MEMO_PAGE_X_FIX)); @@ -3915,25 +3940,6 @@ btr_store_big_rec_extern_fields_func( rec_page_no = buf_block_get_page_no(rec_block); ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX); - if (alloc_mtr) { - /* Because alloc_mtr will be committed after - mtr, it is possible that the tablespace has been - extended when the B-tree record was updated or - inserted, or it will be extended while allocating - pages for big_rec. - - TODO: In mtr (not alloc_mtr), write a redo log record - about extending the tablespace to its current size, - and remember the current size. Whenever the tablespace - grows as pages are allocated, write further redo log - records to mtr. (Currently tablespace extension is not - covered by the redo log. If it were, the record would - only be written to alloc_mtr, which is committed after - mtr.) */ - } else { - alloc_mtr = &mtr; - } - if (UNIV_LIKELY_NULL(page_zip)) { int err; @@ -4010,7 +4016,7 @@ btr_store_big_rec_extern_fields_func( } block = btr_page_alloc(index, hint_page_no, - FSP_NO_DIR, 0, alloc_mtr, &mtr); + FSP_NO_DIR, 0, &mtr); if (UNIV_UNLIKELY(block == NULL)) { mtr_commit(&mtr); @@ -4137,15 +4143,11 @@ btr_store_big_rec_extern_fields_func( goto next_zip_page; } - if (alloc_mtr == &mtr) { - rec_block = buf_page_get( - space_id, zip_size, - rec_page_no, - RW_X_LATCH, &mtr); - buf_block_dbg_add_level( - rec_block, - SYNC_NO_ORDER_CHECK); - } + rec_block = buf_page_get(space_id, zip_size, + rec_page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(rec_block, + SYNC_NO_ORDER_CHECK); if (err == Z_STREAM_END) { mach_write_to_4(field_ref @@ -4179,8 +4181,7 @@ btr_store_big_rec_extern_fields_func( page_zip_write_blob_ptr( page_zip, rec, index, offsets, - big_rec_vec->fields[i].field_no, - alloc_mtr); + big_rec_vec->fields[i].field_no, &mtr); next_zip_page: prev_page_no = page_no; @@ -4225,23 +4226,19 @@ next_zip_page: extern_len -= store_len; - if (alloc_mtr == &mtr) { - rec_block = buf_page_get( - space_id, zip_size, - rec_page_no, - RW_X_LATCH, &mtr); - buf_block_dbg_add_level( - rec_block, - SYNC_NO_ORDER_CHECK); - } + rec_block = buf_page_get(space_id, zip_size, + rec_page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(rec_block, + SYNC_NO_ORDER_CHECK); mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0, - MLOG_4BYTES, alloc_mtr); + MLOG_4BYTES, &mtr); mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4, big_rec_vec->fields[i].len - extern_len, - MLOG_4BYTES, alloc_mtr); + MLOG_4BYTES, &mtr); if (prev_page_no == FIL_NULL) { btr_blob_dbg_add_blob( @@ -4251,19 +4248,18 @@ next_zip_page: mlog_write_ulint(field_ref + BTR_EXTERN_SPACE_ID, - space_id, MLOG_4BYTES, - alloc_mtr); + space_id, + MLOG_4BYTES, &mtr); mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO, - page_no, MLOG_4BYTES, - alloc_mtr); + page_no, + MLOG_4BYTES, &mtr); mlog_write_ulint(field_ref + BTR_EXTERN_OFFSET, FIL_PAGE_DATA, - MLOG_4BYTES, - alloc_mtr); + MLOG_4BYTES, &mtr); } prev_page_no = page_no; diff --git a/storage/innodb_plugin/fsp/fsp0fsp.c b/storage/innodb_plugin/fsp/fsp0fsp.c index 19846b63d5b..fee7fde2e5c 100644 --- a/storage/innodb_plugin/fsp/fsp0fsp.c +++ b/storage/innodb_plugin/fsp/fsp0fsp.c @@ -312,9 +312,8 @@ fsp_fill_free_list( descriptor page and ibuf bitmap page; then we do not allocate more extents */ ulint space, /*!< in: space */ - fsp_header_t* header, /*!< in/out: space header */ - mtr_t* mtr) /*!< in/out: mini-transaction */ - __attribute__((nonnull)); + fsp_header_t* header, /*!< in: space header */ + mtr_t* mtr); /*!< in: mtr */ /**********************************************************************//** Allocates a single free page from a segment. This function implements the intelligent allocation strategy which tries to minimize file space @@ -334,13 +333,7 @@ fseg_alloc_free_page_low( inserted there in order, into which direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR */ - mtr_t* mtr, /*!< in/out: mini-transaction */ - mtr_t* init_mtr)/*!< in/out: mini-transaction in which the - page should be initialized - (may be the same as mtr), or NULL if it - should not be initialized (the page at hint - was previously freed in mtr) */ - __attribute__((warn_unused_result, nonnull(3,6))); + mtr_t* mtr); /*!< in/out: mini-transaction */ #endif /* !UNIV_HOTBACKUP */ /**********************************************************************//** @@ -708,18 +701,17 @@ list, if not free limit == space size. This adding is necessary to make the descriptor defined, as they are uninitialized above the free limit. @return pointer to the extent descriptor, NULL if the page does not exist in the space or if the offset exceeds the free limit */ -UNIV_INLINE __attribute__((nonnull, warn_unused_result)) +UNIV_INLINE xdes_t* xdes_get_descriptor_with_space_hdr( /*===============================*/ - fsp_header_t* sp_header, /*!< in/out: space header, x-latched - in mtr */ - ulint space, /*!< in: space id */ - ulint offset, /*!< in: page offset; if equal - to the free limit, we try to - add new extents to the space - free list */ - mtr_t* mtr) /*!< in/out: mini-transaction */ + fsp_header_t* sp_header,/*!< in/out: space header, x-latched */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: page offset; + if equal to the free limit, + we try to add new extents to + the space free list */ + mtr_t* mtr) /*!< in: mtr handle */ { ulint limit; ulint size; @@ -727,9 +719,11 @@ xdes_get_descriptor_with_space_hdr( ulint descr_page_no; page_t* descr_page; + ut_ad(mtr); ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL), MTR_MEMO_X_LOCK)); - ut_ad(mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_S_FIX) + || mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_X_FIX)); ut_ad(page_offset(sp_header) == FSP_HEADER_OFFSET); /* Read free limit and space size */ limit = mach_read_from_4(sp_header + FSP_FREE_LIMIT); @@ -779,7 +773,7 @@ is necessary to make the descriptor defined, as they are uninitialized above the free limit. @return pointer to the extent descriptor, NULL if the page does not exist in the space or if the offset exceeds the free limit */ -static __attribute__((nonnull, warn_unused_result)) +static xdes_t* xdes_get_descriptor( /*================*/ @@ -788,7 +782,7 @@ xdes_get_descriptor( or 0 for uncompressed pages */ ulint offset, /*!< in: page offset; if equal to the free limit, we try to add new extents to the space free list */ - mtr_t* mtr) /*!< in/out: mini-transaction */ + mtr_t* mtr) /*!< in: mtr handle */ { buf_block_t* block; fsp_header_t* sp_header; @@ -1166,14 +1160,14 @@ fsp_header_get_tablespace_size(void) Tries to extend a single-table tablespace so that a page would fit in the data file. @return TRUE if success */ -static __attribute__((nonnull, warn_unused_result)) +static ibool fsp_try_extend_data_file_with_pages( /*================================*/ ulint space, /*!< in: space */ ulint page_no, /*!< in: page number */ - fsp_header_t* header, /*!< in/out: space header */ - mtr_t* mtr) /*!< in/out: mini-transaction */ + fsp_header_t* header, /*!< in: space header */ + mtr_t* mtr) /*!< in: mtr */ { ibool success; ulint actual_size; @@ -1198,7 +1192,7 @@ fsp_try_extend_data_file_with_pages( /***********************************************************************//** Tries to extend the last data file of a tablespace if it is auto-extending. @return FALSE if not auto-extending */ -static __attribute__((nonnull)) +static ibool fsp_try_extend_data_file( /*=====================*/ @@ -1208,8 +1202,8 @@ fsp_try_extend_data_file( the actual file size rounded down to megabyte */ ulint space, /*!< in: space */ - fsp_header_t* header, /*!< in/out: space header */ - mtr_t* mtr) /*!< in/out: mini-transaction */ + fsp_header_t* header, /*!< in: space header */ + mtr_t* mtr) /*!< in: mtr */ { ulint size; ulint zip_size; @@ -1345,7 +1339,7 @@ fsp_fill_free_list( then we do not allocate more extents */ ulint space, /*!< in: space */ fsp_header_t* header, /*!< in/out: space header */ - mtr_t* mtr) /*!< in/out: mini-transaction */ + mtr_t* mtr) /*!< in: mtr */ { ulint limit; ulint size; @@ -1544,46 +1538,9 @@ fsp_alloc_free_extent( } /**********************************************************************//** -Allocates a single free page from a space. */ -static __attribute__((nonnull)) -void -fsp_alloc_from_free_frag( -/*=====================*/ - fsp_header_t* header, /*!< in/out: tablespace header */ - xdes_t* descr, /*!< in/out: extent descriptor */ - ulint bit, /*!< in: slot to allocate in the extent */ - mtr_t* mtr) /*!< in/out: mini-transaction */ -{ - ulint frag_n_used; - - ut_ad(xdes_get_state(descr, mtr) == XDES_FREE_FRAG); - ut_a(xdes_get_bit(descr, XDES_FREE_BIT, bit, mtr)); - xdes_set_bit(descr, XDES_FREE_BIT, bit, FALSE, mtr); - - /* Update the FRAG_N_USED field */ - frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, - mtr); - frag_n_used++; - mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES, - mtr); - if (xdes_is_full(descr, mtr)) { - /* The fragment is full: move it to another list */ - flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, - mtr); - xdes_set_state(descr, XDES_FULL_FRAG, mtr); - - flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE, - mtr); - mlog_write_ulint(header + FSP_FRAG_N_USED, - frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES, - mtr); - } -} - -/**********************************************************************//** Allocates a single free page from a space. The page is marked as used. @return the page offset, FIL_NULL if no page could be allocated */ -static __attribute__((nonnull, warn_unused_result)) +static ulint fsp_alloc_free_page( /*================*/ @@ -1591,22 +1548,19 @@ fsp_alloc_free_page( ulint zip_size,/*!< in: compressed page size in bytes or 0 for uncompressed pages */ ulint hint, /*!< in: hint of which page would be desirable */ - mtr_t* mtr, /*!< in/out: mini-transaction */ - mtr_t* init_mtr)/*!< in/out: mini-transaction in which the - page should be initialized - (may be the same as mtr) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { fsp_header_t* header; fil_addr_t first; xdes_t* descr; buf_block_t* block; ulint free; + ulint frag_n_used; ulint page_no; ulint space_size; ibool success; ut_ad(mtr); - ut_ad(init_mtr); header = fsp_get_space_header(space, zip_size, mtr); @@ -1688,19 +1642,38 @@ fsp_alloc_free_page( } } - fsp_alloc_from_free_frag(header, descr, free, mtr); + xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr); + + /* Update the FRAG_N_USED field */ + frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, + mtr); + frag_n_used++; + mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES, + mtr); + if (xdes_is_full(descr, mtr)) { + /* The fragment is full: move it to another list */ + flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, + mtr); + xdes_set_state(descr, XDES_FULL_FRAG, mtr); + + flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE, + mtr); + mlog_write_ulint(header + FSP_FRAG_N_USED, + frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES, + mtr); + } /* Initialize the allocated page to the buffer pool, so that it can be obtained immediately with buf_page_get without need for a disk read. */ - buf_page_create(space, page_no, zip_size, init_mtr); + buf_page_create(space, page_no, zip_size, mtr); - block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, init_mtr); + block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_FSP_PAGE); /* Prior contents of the page should be ignored */ - fsp_init_file_page(block, init_mtr); + fsp_init_file_page(block, mtr); return(page_no); } @@ -1936,7 +1909,7 @@ fsp_alloc_seg_inode_page( zip_size = dict_table_flags_to_zip_size( mach_read_from_4(FSP_SPACE_FLAGS + space_header)); - page_no = fsp_alloc_free_page(space, zip_size, 0, mtr, mtr); + page_no = fsp_alloc_free_page(space, zip_size, 0, mtr); if (page_no == FIL_NULL) { @@ -2350,7 +2323,7 @@ fseg_create_general( if (page == 0) { page = fseg_alloc_free_page_low(space, zip_size, - inode, 0, FSP_UP, mtr, mtr); + inode, 0, FSP_UP, mtr); if (page == FIL_NULL) { @@ -2606,12 +2579,7 @@ fseg_alloc_free_page_low( inserted there in order, into which direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR */ - mtr_t* mtr, /*!< in/out: mini-transaction */ - mtr_t* init_mtr)/*!< in/out: mini-transaction in which the - page should be initialized - (may be the same as mtr), or NULL if it - should not be initialized (the page at hint - was previously freed in mtr) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { fsp_header_t* space_header; ulint space_size; @@ -2622,6 +2590,7 @@ fseg_alloc_free_page_low( ulint ret_page; /*!< the allocated page offset, FIL_NULL if could not be allocated */ xdes_t* ret_descr; /*!< the extent of the allocated page */ + ibool frag_page_allocated = FALSE; ibool success; ulint n; @@ -2643,8 +2612,6 @@ fseg_alloc_free_page_low( if (descr == NULL) { /* Hint outside space or too high above free limit: reset hint */ - ut_a(init_mtr); - /* The file space header page is always allocated. */ hint = 0; descr = xdes_get_descriptor(space, zip_size, hint, mtr); } @@ -2656,20 +2623,15 @@ fseg_alloc_free_page_low( mtr), seg_id)) && (xdes_get_bit(descr, XDES_FREE_BIT, hint % FSP_EXTENT_SIZE, mtr) == TRUE)) { -take_hinted_page: + /* 1. We can take the hinted page =================================*/ ret_descr = descr; ret_page = hint; - /* Skip the check for extending the tablespace. If the - page hint were not within the size of the tablespace, - we would have got (descr == NULL) above and reset the hint. */ - goto got_hinted_page; /*-----------------------------------------------------------*/ - } else if (xdes_get_state(descr, mtr) == XDES_FREE - && (!init_mtr - || ((reserved - used < reserved / FSEG_FILLFACTOR) - && used >= FSEG_FRAG_LIMIT))) { + } else if ((xdes_get_state(descr, mtr) == XDES_FREE) + && ((reserved - used) < reserved / FSEG_FILLFACTOR) + && (used >= FSEG_FRAG_LIMIT)) { /* 2. We allocate the free extent from space and can take ========================================================= @@ -2687,20 +2649,8 @@ take_hinted_page: /* Try to fill the segment free list */ fseg_fill_free_list(seg_inode, space, zip_size, hint + FSP_EXTENT_SIZE, mtr); - goto take_hinted_page; - /*-----------------------------------------------------------*/ - } else if (!init_mtr) { - ut_a(xdes_get_state(descr, mtr) == XDES_FREE_FRAG); - fsp_alloc_from_free_frag(space_header, descr, - hint % FSP_EXTENT_SIZE, mtr); ret_page = hint; - ret_descr = NULL; - - /* Put the page in the fragment page array of the segment */ - n = fseg_find_free_frag_page_slot(seg_inode, mtr); - ut_a(n != FIL_NULL); - fseg_set_nth_frag_page_no(seg_inode, n, ret_page, mtr); - goto got_hinted_page; + /*-----------------------------------------------------------*/ } else if ((direction != FSP_NO_DIR) && ((reserved - used) < reserved / FSEG_FILLFACTOR) && (used >= FSEG_FRAG_LIMIT) @@ -2760,10 +2710,11 @@ take_hinted_page: } else if (used < FSEG_FRAG_LIMIT) { /* 6. We allocate an individual page from the space ===================================================*/ - ret_page = fsp_alloc_free_page(space, zip_size, hint, - mtr, init_mtr); + ret_page = fsp_alloc_free_page(space, zip_size, hint, mtr); ret_descr = NULL; + frag_page_allocated = TRUE; + if (ret_page != FIL_NULL) { /* Put the page in the fragment page array of the segment */ @@ -2773,10 +2724,6 @@ take_hinted_page: fseg_set_nth_frag_page_no(seg_inode, n, ret_page, mtr); } - - /* fsp_alloc_free_page() invoked fsp_init_file_page() - already. */ - return(ret_page); /*-----------------------------------------------------------*/ } else { /* 7. We allocate a new extent and take its first page @@ -2824,34 +2771,26 @@ take_hinted_page: } } -got_hinted_page: - { + if (!frag_page_allocated) { /* Initialize the allocated page to buffer pool, so that it can be obtained immediately with buf_page_get without need for a disk read */ buf_block_t* block; ulint zip_size = dict_table_flags_to_zip_size( mach_read_from_4(FSP_SPACE_FLAGS + space_header)); - mtr_t* block_mtr = init_mtr ? init_mtr : mtr; - block = buf_page_create(space, ret_page, zip_size, block_mtr); + block = buf_page_create(space, ret_page, zip_size, mtr); buf_block_dbg_add_level(block, SYNC_FSP_PAGE); if (UNIV_UNLIKELY(block != buf_page_get(space, zip_size, ret_page, RW_X_LATCH, - block_mtr))) { + mtr))) { ut_error; } - if (init_mtr) { - /* The prior contents of the page should be ignored */ - fsp_init_file_page(block, init_mtr); - } - } + /* The prior contents of the page should be ignored */ + fsp_init_file_page(block, mtr); - /* ret_descr == NULL if the block was allocated from free_frag - (XDES_FREE_FRAG) */ - if (ret_descr != NULL) { /* At this point we know the extent and the page offset. The extent is still in the appropriate list (FSEG_NOT_FULL or FSEG_FREE), and the page is not yet marked as used. */ @@ -2888,11 +2827,7 @@ fseg_alloc_free_page_general( with fsp_reserve_free_extents, then there is no need to do the check for this individual page */ - mtr_t* mtr, /*!< in/out: mini-transaction handle */ - mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction - in which the page should be initialized, - or NULL if this is a "fake allocation" of - a page that was previously freed in mtr */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { fseg_inode_t* inode; ulint space; @@ -2934,8 +2869,7 @@ fseg_alloc_free_page_general( } page_no = fseg_alloc_free_page_low(space, zip_size, - inode, hint, direction, - mtr, init_mtr); + inode, hint, direction, mtr); if (!has_done_reservation) { fil_space_release_free_extents(space, n_reserved); } @@ -2944,6 +2878,28 @@ fseg_alloc_free_page_general( } /**********************************************************************//** +Allocates a single free page from a segment. This function implements +the intelligent allocation strategy which tries to minimize file space +fragmentation. +@return allocated page offset, FIL_NULL if no page could be allocated */ +UNIV_INTERN +ulint +fseg_alloc_free_page( +/*=================*/ + fseg_header_t* seg_header,/*!< in: segment header */ + ulint hint, /*!< in: hint of which page would be desirable */ + byte direction,/*!< in: if the new page is needed because + of an index page split, and records are + inserted there in order, into which + direction they go alphabetically: FSP_DOWN, + FSP_UP, FSP_NO_DIR */ + mtr_t* mtr) /*!< in: mtr handle */ +{ + return(fseg_alloc_free_page_general(seg_header, hint, direction, + FALSE, mtr)); +} + +/**********************************************************************//** Checks that we have at least 2 frag pages free in the first extent of a single-table tablespace, and they are also physically initialized to the data file. That is we have already extended the data file so that those pages are diff --git a/storage/innodb_plugin/include/btr0btr.h b/storage/innodb_plugin/include/btr0btr.h index 476ad29adac..c0a038dd21d 100644 --- a/storage/innodb_plugin/include/btr0btr.h +++ b/storage/innodb_plugin/include/btr0btr.h @@ -557,12 +557,7 @@ btr_page_alloc( page split is made */ ulint level, /*!< in: level where the page is placed in the tree */ - mtr_t* mtr, /*!< in/out: mini-transaction - for the allocation */ - mtr_t* init_mtr) /*!< in/out: mini-transaction - for x-latching and initializing - the page */ - __attribute__((nonnull, warn_unused_result)); + mtr_t* mtr); /*!< in: mtr */ /**************************************************************//** Frees a file page used in an index tree. NOTE: cannot free field external storage pages because the page must contain info on its level. */ @@ -585,33 +580,6 @@ btr_page_free_low( buf_block_t* block, /*!< in: block to be freed, x-latched */ ulint level, /*!< in: page level */ mtr_t* mtr); /*!< in: mtr */ -/**************************************************************//** -Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free. -For invoking btr_store_big_rec_extern_fields() after an update, -we must temporarily mark freed clustered index pages allocated, so -that off-page columns will not be allocated from them. Between the -btr_store_big_rec_extern_fields() and mtr_commit() we have to -mark the pages free again, so that no pages will be leaked. */ -UNIV_INTERN -void -btr_mark_freed_leaves( -/*==================*/ - dict_index_t* index, /*!< in/out: clustered index */ - mtr_t* mtr, /*!< in/out: mini-transaction */ - ibool nonfree)/*!< in: TRUE=mark nonfree, FALSE=mark freed */ - __attribute__((nonnull)); -#ifdef UNIV_DEBUG -/**************************************************************//** -Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF. -@see btr_mark_freed_leaves() -@return TRUE */ -UNIV_INTERN -ibool -btr_freed_leaves_validate( -/*======================*/ - mtr_t* mtr) /*!< in: mini-transaction */ - __attribute__((nonnull, warn_unused_result)); -#endif /* UNIV_DEBUG */ #ifdef UNIV_BTR_PRINT /*************************************************************//** Prints size info of a B-tree. */ diff --git a/storage/innodb_plugin/include/btr0cur.h b/storage/innodb_plugin/include/btr0cur.h index 1d97c5b9452..6094a2a6c7a 100644 --- a/storage/innodb_plugin/include/btr0cur.h +++ b/storage/innodb_plugin/include/btr0cur.h @@ -326,6 +326,16 @@ btr_cur_pessimistic_update( que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr); /*!< in: mtr; must be committed before latching any further pages */ +/***************************************************************** +Commits and restarts a mini-transaction so that it will retain an +x-lock on index->lock and the cursor page. */ +UNIV_INTERN +void +btr_cur_mtr_commit_and_start( +/*=========================*/ + btr_cur_t* cursor, /*!< in: cursor */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); /***********************************************************//** Marks a clustered index record deleted. Writes an undo log record to undo log on this delete marking. Writes in the trx id field the id @@ -530,8 +540,6 @@ btr_store_big_rec_extern_fields_func( the "external storage" flags in offsets will not correspond to rec when this function returns */ - const big_rec_t*big_rec_vec, /*!< in: vector containing fields - to be stored externally */ #ifdef UNIV_DEBUG mtr_t* local_mtr, /*!< in: mtr containing the latch to rec and to the tree */ @@ -540,12 +548,9 @@ btr_store_big_rec_extern_fields_func( ibool update_in_place,/*! in: TRUE if the record is updated in place (not delete+insert) */ #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ - mtr_t* alloc_mtr) /*!< in/out: in an insert, NULL; - in an update, local_mtr for - allocating BLOB pages and - updating BLOB pointers; alloc_mtr - must not have freed any leaf pages */ - __attribute__((nonnull(1,2,3,4,5), warn_unused_result)); + const big_rec_t*big_rec_vec) /*!< in: vector containing fields + to be stored externally */ + __attribute__((nonnull)); /** Stores the fields in big_rec_vec to the tablespace and puts pointers to them in rec. The extern flags in rec will have to be set beforehand. @@ -554,22 +559,21 @@ file segment of the index tree. @param index in: clustered index; MUST be X-latched by mtr @param b in/out: block containing rec; MUST be X-latched by mtr @param rec in/out: clustered index record -@param offs in: rec_get_offsets(rec, index); +@param offsets in: rec_get_offsets(rec, index); the "external storage" flags in offsets will not be adjusted -@param big in: vector containing fields to be stored externally @param mtr in: mini-transaction that holds x-latch on index and b @param upd in: TRUE if the record is updated in place (not delete+insert) -@param rmtr in/out: in updates, the mini-transaction that holds rec +@param big in: vector containing fields to be stored externally @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ #ifdef UNIV_DEBUG -# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \ - btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,mtr,upd,rmtr) +# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \ + btr_store_big_rec_extern_fields_func(index,b,rec,offsets,mtr,upd,big) #elif defined UNIV_BLOB_LIGHT_DEBUG -# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \ - btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,upd,rmtr) +# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \ + btr_store_big_rec_extern_fields_func(index,b,rec,offsets,upd,big) #else -# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \ - btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,rmtr) +# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \ + btr_store_big_rec_extern_fields_func(index,b,rec,offsets,big) #endif /*******************************************************************//** diff --git a/storage/innodb_plugin/include/fsp0fsp.h b/storage/innodb_plugin/include/fsp0fsp.h index 2221380c9a2..403e1d404a8 100644 --- a/storage/innodb_plugin/include/fsp0fsp.h +++ b/storage/innodb_plugin/include/fsp0fsp.h @@ -176,18 +176,19 @@ fseg_n_reserved_pages( Allocates a single free page from a segment. This function implements the intelligent allocation strategy which tries to minimize file space fragmentation. -@param[in/out] seg_header segment header -@param[in] hint hint of which page would be desirable -@param[in] direction if the new page is needed because +@return the allocated page offset FIL_NULL if no page could be allocated */ +UNIV_INTERN +ulint +fseg_alloc_free_page( +/*=================*/ + fseg_header_t* seg_header, /*!< in: segment header */ + ulint hint, /*!< in: hint of which page would be desirable */ + byte direction, /*!< in: if the new page is needed because of an index page split, and records are inserted there in order, into which direction they go alphabetically: FSP_DOWN, - FSP_UP, FSP_NO_DIR -@param[in/out] mtr mini-transaction -@return the allocated page offset FIL_NULL if no page could be allocated */ -#define fseg_alloc_free_page(seg_header, hint, direction, mtr) \ - fseg_alloc_free_page_general(seg_header, hint, direction, \ - FALSE, mtr, mtr) + FSP_UP, FSP_NO_DIR */ + mtr_t* mtr); /*!< in: mtr handle */ /**********************************************************************//** Allocates a single free page from a segment. This function implements the intelligent allocation strategy which tries to minimize file space @@ -209,11 +210,7 @@ fseg_alloc_free_page_general( with fsp_reserve_free_extents, then there is no need to do the check for this individual page */ - mtr_t* mtr, /*!< in/out: mini-transaction */ - mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction - in which the page should be initialized, - or NULL if this is a "fake allocation" of - a page that was previously freed in mtr */ + mtr_t* mtr) /*!< in/out: mini-transaction */ __attribute__((warn_unused_result, nonnull(1,5))); /**********************************************************************//** Reserves free pages from a tablespace. All mini-transactions which may diff --git a/storage/innodb_plugin/include/mtr0mtr.h b/storage/innodb_plugin/include/mtr0mtr.h index 3529519e7f4..8a9ec8ea7f0 100644 --- a/storage/innodb_plugin/include/mtr0mtr.h +++ b/storage/innodb_plugin/include/mtr0mtr.h @@ -53,8 +53,6 @@ first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ #define MTR_MEMO_MODIFY 54 #define MTR_MEMO_S_LOCK 55 #define MTR_MEMO_X_LOCK 56 -/** The mini-transaction freed a clustered index leaf page. */ -#define MTR_MEMO_FREE_CLUST_LEAF 57 /** @name Log item types The log items are declared 'byte' so that the compiler can warn if val @@ -379,12 +377,9 @@ struct mtr_struct{ #endif dyn_array_t memo; /*!< memo stack for locks etc. */ dyn_array_t log; /*!< mini-transaction log */ - unsigned modifications:1; - /*!< TRUE if the mini-transaction - modified buffer pool pages */ - unsigned freed_clust_leaf:1; - /*!< TRUE if MTR_MEMO_FREE_CLUST_LEAF - was logged in the mini-transaction */ + ibool modifications; + /* TRUE if the mtr made modifications to + buffer pool pages */ ulint n_log_recs; /* count of how many page initial log records have been written to the mtr log */ diff --git a/storage/innodb_plugin/include/mtr0mtr.ic b/storage/innodb_plugin/include/mtr0mtr.ic index 9c0ddff9132..9f92d2b06a1 100644 --- a/storage/innodb_plugin/include/mtr0mtr.ic +++ b/storage/innodb_plugin/include/mtr0mtr.ic @@ -44,7 +44,6 @@ mtr_start( mtr->log_mode = MTR_LOG_ALL; mtr->modifications = FALSE; - mtr->freed_clust_leaf = FALSE; mtr->n_log_recs = 0; ut_d(mtr->state = MTR_ACTIVE); @@ -68,8 +67,7 @@ mtr_memo_push( ut_ad(object); ut_ad(type >= MTR_MEMO_PAGE_S_FIX); - ut_ad(type <= MTR_MEMO_FREE_CLUST_LEAF); - ut_ad(type != MTR_MEMO_FREE_CLUST_LEAF || mtr->freed_clust_leaf); + ut_ad(type <= MTR_MEMO_X_LOCK); ut_ad(mtr); ut_ad(mtr->magic_n == MTR_MAGIC_N); ut_ad(mtr->state == MTR_ACTIVE); diff --git a/storage/innodb_plugin/mtr/mtr0mtr.c b/storage/innodb_plugin/mtr/mtr0mtr.c index e3fefbedec7..5fad61b2922 100644 --- a/storage/innodb_plugin/mtr/mtr0mtr.c +++ b/storage/innodb_plugin/mtr/mtr0mtr.c @@ -58,11 +58,12 @@ mtr_memo_slot_release( buf_page_release((buf_block_t*)object, type, mtr); } else if (type == MTR_MEMO_S_LOCK) { rw_lock_s_unlock((rw_lock_t*)object); +#ifdef UNIV_DEBUG } else if (type != MTR_MEMO_X_LOCK) { - ut_ad(type == MTR_MEMO_MODIFY - || type == MTR_MEMO_FREE_CLUST_LEAF); + ut_ad(type == MTR_MEMO_MODIFY); ut_ad(mtr_memo_contains(mtr, object, MTR_MEMO_PAGE_X_FIX)); +#endif /* UNIV_DEBUG */ } else { rw_lock_x_unlock((rw_lock_t*)object); } diff --git a/storage/innodb_plugin/row/row0ins.c b/storage/innodb_plugin/row/row0ins.c index cd135e8ba8f..f0f6eca627f 100644 --- a/storage/innodb_plugin/row/row0ins.c +++ b/storage/innodb_plugin/row/row0ins.c @@ -2097,20 +2097,15 @@ row_ins_index_entry_low( if (big_rec) { ut_a(err == DB_SUCCESS); /* Write out the externally stored - columns, but allocate the pages and - write the pointers using the - mini-transaction of the record update. - If any pages were freed in the update, - temporarily mark them allocated so - that off-page columns will not - overwrite them. We must do this, - because we will write the redo log for - the BLOB writes before writing the - redo log for the record update. Thus, - redo log application at crash recovery - will see BLOBs being written to free pages. */ - - btr_mark_freed_leaves(index, &mtr, TRUE); + columns while still x-latching + index->lock and block->lock. We have + to mtr_commit(mtr) first, so that the + redo log will be written in the + correct order. Otherwise, we would run + into trouble on crash recovery if mtr + freed B-tree pages on which some of + the big_rec fields will be written. */ + btr_cur_mtr_commit_and_start(&cursor, &mtr); rec = btr_cur_get_rec(&cursor); offsets = rec_get_offsets( @@ -2119,8 +2114,7 @@ row_ins_index_entry_low( err = btr_store_big_rec_extern_fields( index, btr_cur_get_block(&cursor), - rec, offsets, big_rec, &mtr, - FALSE, &mtr); + rec, offsets, &mtr, FALSE, big_rec); /* If writing big_rec fails (for example, because of DB_OUT_OF_FILE_SPACE), the record will be corrupted. Even if @@ -2133,9 +2127,6 @@ row_ins_index_entry_low( undo log, and thus the record cannot be rolled back. */ ut_a(err == DB_SUCCESS); - /* Free the pages again - in order to avoid a leak. */ - btr_mark_freed_leaves(index, &mtr, FALSE); goto stored_big_rec; } } else { @@ -2177,7 +2168,7 @@ function_exit: err = btr_store_big_rec_extern_fields( index, btr_cur_get_block(&cursor), - rec, offsets, big_rec, &mtr, FALSE, NULL); + rec, offsets, &mtr, FALSE, big_rec); stored_big_rec: if (modify) { diff --git a/storage/innodb_plugin/row/row0row.c b/storage/innodb_plugin/row/row0row.c index e476ffae84e..9cdbbe76e04 100644 --- a/storage/innodb_plugin/row/row0row.c +++ b/storage/innodb_plugin/row/row0row.c @@ -243,20 +243,19 @@ row_build( } #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG - if (rec_offs_any_null_extern(rec, offsets)) { - /* This condition can occur during crash recovery - before trx_rollback_active() has completed execution. - - This condition is possible if the server crashed - during an insert or update-by-delete-and-insert before - btr_store_big_rec_extern_fields() did mtr_commit() all - BLOB pointers to the freshly inserted clustered index - record. */ - ut_a(trx_assert_recovered( - row_get_rec_trx_id(rec, index, offsets))); - ut_a(trx_undo_roll_ptr_is_insert( - row_get_rec_roll_ptr(rec, index, offsets))); - } + /* This condition can occur during crash recovery before + trx_rollback_active() has completed execution. + + This condition is possible if the server crashed + during an insert or update before + btr_store_big_rec_extern_fields() did mtr_commit() all + BLOB pointers to the clustered index record. + + If the record contains a null BLOB pointer, look up the + transaction that holds the implicit lock on this record, and + assert that it was recovered (and will soon be rolled back). */ + ut_a(!rec_offs_any_null_extern(rec, offsets) + || trx_assert_recovered(row_get_rec_trx_id(rec, index, offsets))); #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ if (type != ROW_COPY_POINTERS) { diff --git a/storage/innodb_plugin/row/row0upd.c b/storage/innodb_plugin/row/row0upd.c index 05856687015..b5952ff0a78 100644 --- a/storage/innodb_plugin/row/row0upd.c +++ b/storage/innodb_plugin/row/row0upd.c @@ -1978,22 +1978,21 @@ row_upd_clust_rec( rec_offs_init(offsets_); ut_a(err == DB_SUCCESS); - /* Write out the externally stored columns, but - allocate the pages and write the pointers using the - mini-transaction of the record update. If any pages - were freed in the update, temporarily mark them - allocated so that off-page columns will not overwrite - them. We must do this, because we write the redo log - for the BLOB writes before writing the redo log for - the record update. */ - - btr_mark_freed_leaves(index, mtr, TRUE); + /* Write out the externally stored columns while still + x-latching index->lock and block->lock. We have to + mtr_commit(mtr) first, so that the redo log will be + written in the correct order. Otherwise, we would run + into trouble on crash recovery if mtr freed B-tree + pages on which some of the big_rec fields will be + written. */ + btr_cur_mtr_commit_and_start(btr_cur, mtr); + rec = btr_cur_get_rec(btr_cur); err = btr_store_big_rec_extern_fields( index, btr_cur_get_block(btr_cur), rec, rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap), - big_rec, mtr, TRUE, mtr); + mtr, TRUE, big_rec); /* If writing big_rec fails (for example, because of DB_OUT_OF_FILE_SPACE), the record will be corrupted. Even if we did not update any externally stored @@ -2003,8 +2002,6 @@ row_upd_clust_rec( to the undo log, and thus the record cannot be rolled back. */ ut_a(err == DB_SUCCESS); - /* Free the pages again in order to avoid a leak. */ - btr_mark_freed_leaves(index, mtr, FALSE); } mtr_commit(mtr); diff --git a/storage/innodb_plugin/trx/trx0undo.c b/storage/innodb_plugin/trx/trx0undo.c index c36f55fbd9c..746f0808643 100644 --- a/storage/innodb_plugin/trx/trx0undo.c +++ b/storage/innodb_plugin/trx/trx0undo.c @@ -912,7 +912,7 @@ trx_undo_add_page( page_no = fseg_alloc_free_page_general(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, undo->top_page_no + 1, FSP_UP, - TRUE, mtr, mtr); + TRUE, mtr); fil_space_release_free_extents(undo->space, n_reserved); |