author      Mikael Ronstrom <mikael@dator8>    2011-03-04 12:35:24 +0100
committer   Mikael Ronstrom <mikael@dator8>    2011-03-04 12:35:24 +0100
commit      0fc7078e53eb2b34503ddd2408844c2fa06d393f (patch)
tree        bccbd7ab21bb499f9f924e6f90e9303f3018d1ae /storage
parent      23296fdc18e864b1b016b4c6a47723b26480f9c6 (diff)
parent      a4711099716c53490a69c3c17a8b61a4a3282b19 (diff)
merge
Diffstat (limited to 'storage')
42 files changed, 2007 insertions(+), 413 deletions(-)
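Much of the new code in this merge falls into two areas: a UNIV_BLOB_DEBUG facility in btr0btr.c that tracks off-page (BLOB) column references in a per-index red-black tree, and a new binary min-heap utility (storage/innobase/include/ut0bh.h and ut0bh.ic) that the purge subsystem uses to keep rollback segments ordered on rseg_queue_t::trx_no. As a rough illustration of the push/pop behaviour that the ib_bh_push()/ib_bh_pop()/ib_bh_first() interface declared in ut0bh.h describes, here is a standalone min-heap sketch in plain C. It is not the InnoDB implementation; the heap_* names and the use of plain unsigned long keys (instead of rseg_queue_t elements and a comparator callback) are simplifications for illustration only.

#include <stdio.h>
#include <stdlib.h>

/* A fixed-capacity binary min-heap of unsigned long keys.
   elems[0] is always the smallest key in the heap. */
typedef struct {
	unsigned long*	elems;		/* element array */
	size_t		n_elems;	/* current number of elements */
	size_t		max_elems;	/* capacity */
} min_heap_t;

static min_heap_t* heap_create(size_t max_elems)
{
	min_heap_t*	h = malloc(sizeof(*h));	/* error handling omitted */

	h->elems = malloc(max_elems * sizeof(*h->elems));
	h->n_elems = 0;
	h->max_elems = max_elems;
	return h;
}

/* Insert a key and sift it up while it is smaller than its parent. */
static int heap_push(min_heap_t* h, unsigned long key)
{
	size_t	i;

	if (h->n_elems >= h->max_elems) {
		return -1;	/* heap is full */
	}

	i = h->n_elems++;
	h->elems[i] = key;

	while (i > 0 && h->elems[(i - 1) / 2] > h->elems[i]) {
		unsigned long	tmp = h->elems[i];

		h->elems[i] = h->elems[(i - 1) / 2];
		h->elems[(i - 1) / 2] = tmp;
		i = (i - 1) / 2;
	}

	return 0;
}

/* Remove the minimum: move the last element to the root and sift it down. */
static void heap_pop(min_heap_t* h)
{
	size_t	i = 0;

	if (h->n_elems == 0) {
		return;
	}

	h->elems[0] = h->elems[--h->n_elems];

	for (;;) {
		size_t		left = 2 * i + 1;
		size_t		right = left + 1;
		size_t		smallest = i;
		unsigned long	tmp;

		if (left < h->n_elems && h->elems[left] < h->elems[smallest]) {
			smallest = left;
		}
		if (right < h->n_elems && h->elems[right] < h->elems[smallest]) {
			smallest = right;
		}
		if (smallest == i) {
			break;
		}

		tmp = h->elems[i];
		h->elems[i] = h->elems[smallest];
		h->elems[smallest] = tmp;
		i = smallest;
	}
}

int main(void)
{
	min_heap_t*	h = heap_create(8);

	heap_push(h, 42);
	heap_push(h, 7);
	heap_push(h, 19);

	printf("min = %lu\n", h->elems[0]);	/* prints 7 */
	heap_pop(h);
	printf("min = %lu\n", h->elems[0]);	/* prints 19 */

	free(h->elems);
	free(h);
	return 0;
}

The ut0bh version added below is more general than this sketch: ib_bh_create() takes an ib_bh_cmp_t comparator and an element size, the fixed-size elements are stored inline directly after the ib_bh_t header (see ib_bh_get() in ut0bh.ic), and trx0purge fills it with rseg_queue_t entries protected by the new purge_sys bh_mutex.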
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index 4bbda0d2477..d4b98f2af0d 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2006, 2011, Oracle and/or its affiliates. All rights reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -246,7 +246,7 @@ SET(INNOBASE_SOURCES btr/btr0btr.c btr/btr0cur.c btr/btr0pcur.c btr/btr0sea.c trx/trx0sys.c trx/trx0trx.c trx/trx0undo.c usr/usr0sess.c ut/ut0byte.c ut/ut0dbg.c ut/ut0list.c ut/ut0mem.c ut/ut0rbt.c ut/ut0rnd.c - ut/ut0ut.c ut/ut0vec.c ut/ut0wqueue.c) + ut/ut0ut.c ut/ut0vec.c ut/ut0wqueue.c ut/ut0bh.c) IF(WITH_INNODB) # Legacy option diff --git a/storage/innobase/btr/btr0btr.c b/storage/innobase/btr/btr0btr.c index 2e544c0552a..fb2509a62ff 100644 --- a/storage/innobase/btr/btr0btr.c +++ b/storage/innobase/btr/btr0btr.c @@ -42,6 +42,560 @@ Created 6/2/1994 Heikki Tuuri #include "ibuf0ibuf.h" #include "trx0trx.h" +#ifdef UNIV_BLOB_DEBUG +# include "srv0srv.h" +# include "ut0rbt.h" + +/** TRUE when messages about index->blobs modification are enabled. */ +static ibool btr_blob_dbg_msg; + +/** Issue a message about an operation on index->blobs. +@param op operation +@param b the entry being subjected to the operation +@param ctx the context of the operation */ +#define btr_blob_dbg_msg_issue(op, b, ctx) \ + fprintf(stderr, op " %u:%u:%u->%u %s(%u,%u,%u)\n", \ + (b)->ref_page_no, (b)->ref_heap_no, \ + (b)->ref_field_no, (b)->blob_page_no, ctx, \ + (b)->owner, (b)->always_owner, (b)->del) + +/** Insert to index->blobs a reference to an off-page column. +@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_insert( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ +{ + if (btr_blob_dbg_msg) { + btr_blob_dbg_msg_issue("insert", b, ctx); + } + mutex_enter(&index->blobs_mutex); + rbt_insert(index->blobs, b, b); + mutex_exit(&index->blobs_mutex); +} + +/** Remove from index->blobs a reference to an off-page column. +@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_delete( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ +{ + if (btr_blob_dbg_msg) { + btr_blob_dbg_msg_issue("delete", b, ctx); + } + mutex_enter(&index->blobs_mutex); + ut_a(rbt_delete(index->blobs, b)); + mutex_exit(&index->blobs_mutex); +} + +/**************************************************************//** +Comparator for items (btr_blob_dbg_t) in index->blobs. +The key in index->blobs is (ref_page_no, ref_heap_no, ref_field_no). +@return negative, 0 or positive if *a<*b, *a=*b, *a>*b */ +static +int +btr_blob_dbg_cmp( +/*=============*/ + const void* a, /*!< in: first btr_blob_dbg_t to compare */ + const void* b) /*!< in: second btr_blob_dbg_t to compare */ +{ + const btr_blob_dbg_t* aa = a; + const btr_blob_dbg_t* bb = b; + + ut_ad(aa != NULL); + ut_ad(bb != NULL); + + if (aa->ref_page_no != bb->ref_page_no) { + return(aa->ref_page_no < bb->ref_page_no ? 
-1 : 1); + } + if (aa->ref_heap_no != bb->ref_heap_no) { + return(aa->ref_heap_no < bb->ref_heap_no ? -1 : 1); + } + if (aa->ref_field_no != bb->ref_field_no) { + return(aa->ref_field_no < bb->ref_field_no ? -1 : 1); + } + return(0); +} + +/**************************************************************//** +Add a reference to an off-page column to the index->blobs map. */ +UNIV_INTERN +void +btr_blob_dbg_add_blob( +/*==================*/ + const rec_t* rec, /*!< in: clustered index record */ + ulint field_no, /*!< in: off-page column number */ + ulint page_no, /*!< in: start page of the column */ + dict_index_t* index, /*!< in/out: index tree */ + const char* ctx) /*!< in: context (for logging) */ +{ + btr_blob_dbg_t b; + const page_t* page = page_align(rec); + + ut_a(index->blobs); + + b.blob_page_no = page_no; + b.ref_page_no = page_get_page_no(page); + b.ref_heap_no = page_rec_get_heap_no(rec); + b.ref_field_no = field_no; + ut_a(b.ref_field_no >= index->n_uniq); + b.always_owner = b.owner = TRUE; + b.del = FALSE; + ut_a(!rec_get_deleted_flag(rec, page_is_comp(page))); + btr_blob_dbg_rbt_insert(index, &b, ctx); +} + +/**************************************************************//** +Add to index->blobs any references to off-page columns from a record. +@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add_rec( +/*=================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ +{ + ulint count = 0; + ulint i; + btr_blob_dbg_t b; + ibool del; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (!rec_offs_any_extern(offsets)) { + return(0); + } + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + del = (rec_get_deleted_flag(rec, rec_offs_comp(offsets)) != 0); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* field_ref = rec_get_nth_field( + rec, offsets, i, &len); + + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + if (!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)) { + /* the column has not been stored yet */ + continue; + } + + b.ref_field_no = i; + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + ut_a(b.ref_field_no >= index->n_uniq); + b.always_owner = b.owner + = !(field_ref[BTR_EXTERN_LEN] + & BTR_EXTERN_OWNER_FLAG); + b.del = del; + + btr_blob_dbg_rbt_insert(index, &b, ctx); + count++; + } + } + + return(count); +} + +/**************************************************************//** +Display the references to off-page columns. +This function is to be called from a debugger, +for example when a breakpoint on ut_dbg_assertion_failed is hit. */ +UNIV_INTERN +void +btr_blob_dbg_print( +/*===============*/ + const dict_index_t* index) /*!< in: index tree */ +{ + const ib_rbt_node_t* node; + + if (!index->blobs) { + return; + } + + /* We intentionally do not acquire index->blobs_mutex here. + This function is to be called from a debugger, and the caller + should make sure that the index->blobs_mutex is held. */ + + for (node = rbt_first(index->blobs); + node != NULL; node = rbt_next(index->blobs, node)) { + const btr_blob_dbg_t* b + = rbt_value(btr_blob_dbg_t, node); + fprintf(stderr, "%u:%u:%u->%u%s%s%s\n", + b->ref_page_no, b->ref_heap_no, b->ref_field_no, + b->blob_page_no, + b->owner ? 
"" : "(disowned)", + b->always_owner ? "" : "(has disowned)", + b->del ? "(deleted)" : ""); + } +} + +/**************************************************************//** +Remove from index->blobs any references to off-page columns from a record. +@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove_rec( +/*====================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ +{ + ulint i; + ulint count = 0; + btr_blob_dbg_t b; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (!rec_offs_any_extern(offsets)) { + return(0); + } + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* field_ref = rec_get_nth_field( + rec, offsets, i, &len); + + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + b.ref_field_no = i; + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + + switch (b.blob_page_no) { + case 0: + /* The column has not been stored yet. + The BLOB pointer must be all zero. + There cannot be a BLOB starting at + page 0, because page 0 is reserved for + the tablespace header. */ + ut_a(!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + /* fall through */ + case FIL_NULL: + /* the column has been freed already */ + continue; + } + + btr_blob_dbg_rbt_delete(index, &b, ctx); + count++; + } + } + + return(count); +} + +/**************************************************************//** +Check that there are no references to off-page columns from or to +the given page. Invoked when freeing or clearing a page. +@return TRUE when no orphan references exist */ +UNIV_INTERN +ibool +btr_blob_dbg_is_empty( +/*==================*/ + dict_index_t* index, /*!< in: index */ + ulint page_no) /*!< in: page number */ +{ + const ib_rbt_node_t* node; + ibool success = TRUE; + + if (!index->blobs) { + return(success); + } + + mutex_enter(&index->blobs_mutex); + + for (node = rbt_first(index->blobs); + node != NULL; node = rbt_next(index->blobs, node)) { + const btr_blob_dbg_t* b + = rbt_value(btr_blob_dbg_t, node); + + if (b->ref_page_no != page_no && b->blob_page_no != page_no) { + continue; + } + + fprintf(stderr, + "InnoDB: orphan BLOB ref%s%s%s %u:%u:%u->%u\n", + b->owner ? "" : "(disowned)", + b->always_owner ? "" : "(has disowned)", + b->del ? "(deleted)" : "", + b->ref_page_no, b->ref_heap_no, b->ref_field_no, + b->blob_page_no); + + if (b->blob_page_no != page_no || b->owner || !b->del) { + success = FALSE; + } + } + + mutex_exit(&index->blobs_mutex); + return(success); +} + +/**************************************************************//** +Count and process all references to off-page columns on a page. 
+@return number of references processed */ +UNIV_INTERN +ulint +btr_blob_dbg_op( +/*============*/ + const page_t* page, /*!< in: B-tree leaf page */ + const rec_t* rec, /*!< in: record to start from + (NULL to process the whole page) */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx, /*!< in: context (for logging) */ + const btr_blob_dbg_op_f op) /*!< in: operation on records */ +{ + ulint count = 0; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_a(!rec || page_align(rec) == page); + + if (!index->blobs || !page_is_leaf(page) + || !dict_index_is_clust(index)) { + return(0); + } + + if (rec == NULL) { + rec = page_get_infimum_rec(page); + } + + do { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + count += op(rec, index, offsets, ctx); + rec = page_rec_get_next_const(rec); + } while (!page_rec_is_supremum(rec)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(count); +} + +/**************************************************************//** +Count and add to index->blobs any references to off-page columns +from records on a page. +@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add( +/*=============*/ + const page_t* page, /*!< in: rewritten page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ +{ + btr_blob_dbg_assert_empty(index, page_get_page_no(page)); + + return(btr_blob_dbg_op(page, NULL, index, ctx, btr_blob_dbg_add_rec)); +} + +/**************************************************************//** +Count and remove from index->blobs any references to off-page columns +from records on a page. +Used when reorganizing a page, before copying the records. +@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove( +/*================*/ + const page_t* page, /*!< in: b-tree page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ +{ + ulint count; + + count = btr_blob_dbg_op(page, NULL, index, ctx, + btr_blob_dbg_remove_rec); + + /* Check that no references exist. */ + btr_blob_dbg_assert_empty(index, page_get_page_no(page)); + + return(count); +} + +/**************************************************************//** +Restore in index->blobs any references to off-page columns +Used when page reorganize fails due to compressed page overflow. */ +UNIV_INTERN +void +btr_blob_dbg_restore( +/*=================*/ + const page_t* npage, /*!< in: page that failed to compress */ + const page_t* page, /*!< in: copy of original page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ +{ + ulint removed; + ulint added; + + ut_a(page_get_page_no(npage) == page_get_page_no(page)); + ut_a(page_get_space_id(npage) == page_get_space_id(page)); + + removed = btr_blob_dbg_remove(npage, index, ctx); + added = btr_blob_dbg_add(page, index, ctx); + ut_a(added == removed); +} + +/**************************************************************//** +Modify the 'deleted' flag of a record. 
*/ +UNIV_INTERN +void +btr_blob_dbg_set_deleted_flag( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ibool del) /*!< in: TRUE=deleted, FALSE=exists */ +{ + const ib_rbt_node_t* node; + btr_blob_dbg_t b; + btr_blob_dbg_t* c; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_a(dict_index_is_clust(index)); + ut_a(del == !!del);/* must be FALSE==0 or TRUE==1 */ + + if (!rec_offs_any_extern(offsets) || !index->blobs) { + + return; + } + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* field_ref = rec_get_nth_field( + rec, offsets, i, &len); + + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + b.ref_field_no = i; + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + + switch (b.blob_page_no) { + case 0: + ut_a(memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + /* page number 0 is for the + page allocation bitmap */ + case FIL_NULL: + /* the column has been freed already */ + ut_error; + } + + mutex_enter(&index->blobs_mutex); + node = rbt_lookup(index->blobs, &b); + ut_a(node); + + c = rbt_value(btr_blob_dbg_t, node); + /* The flag should be modified. */ + c->del = del; + if (btr_blob_dbg_msg) { + b = *c; + mutex_exit(&index->blobs_mutex); + btr_blob_dbg_msg_issue("del_mk", &b, ""); + } else { + mutex_exit(&index->blobs_mutex); + } + } + } +} + +/**************************************************************//** +Change the ownership of an off-page column. */ +UNIV_INTERN +void +btr_blob_dbg_owner( +/*===============*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ulint i, /*!< in: ith field in rec */ + ibool own) /*!< in: TRUE=owned, FALSE=disowned */ +{ + const ib_rbt_node_t* node; + btr_blob_dbg_t b; + const byte* field_ref; + ulint len; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_a(rec_offs_nth_extern(offsets, i)); + + field_ref = rec_get_nth_field(rec, offsets, i, &len); + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + b.ref_field_no = i; + b.owner = !(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG); + b.blob_page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO); + + ut_a(b.owner == own); + + mutex_enter(&index->blobs_mutex); + node = rbt_lookup(index->blobs, &b); + /* row_ins_clust_index_entry_by_modify() invokes + btr_cur_unmark_extern_fields() also for the newly inserted + references, which are all zero bytes until the columns are stored. + The node lookup must fail if and only if that is the case. */ + ut_a(!memcmp(field_ref, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE) + == !node); + + if (node) { + btr_blob_dbg_t* c = rbt_value(btr_blob_dbg_t, node); + /* Some code sets ownership from TRUE to TRUE. + We do not allow changing ownership from FALSE to FALSE. 
*/ + ut_a(own || c->owner); + + c->owner = own; + if (!own) { + c->always_owner = FALSE; + } + } + + mutex_exit(&index->blobs_mutex); +} +#endif /* UNIV_BLOB_DEBUG */ + /* Latching strategy of the InnoDB B-tree -------------------------------------- @@ -296,6 +850,7 @@ btr_page_create( page_t* page = buf_block_get_frame(block); ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block)); if (UNIV_LIKELY_NULL(page_zip)) { page_create_zip(block, index, level, mtr); @@ -489,6 +1044,7 @@ btr_page_free_low( modify clock */ buf_block_modify_clock_inc(block); + btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block)); if (dict_index_is_ibuf(index)) { @@ -774,6 +1330,14 @@ btr_create( block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr); } else { +#ifdef UNIV_BLOB_DEBUG + if ((type & DICT_CLUSTERED) && !index->blobs) { + mutex_create(PFS_NOT_INSTRUMENTED, + &index->blobs_mutex, SYNC_ANY_LATCH); + index->blobs = rbt_create(sizeof(btr_blob_dbg_t), + btr_blob_dbg_cmp); + } +#endif /* UNIV_BLOB_DEBUG */ block = fseg_create(space, 0, PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr); } @@ -998,6 +1562,7 @@ btr_page_reorganize_low( block->check_index_page_at_flush = TRUE; #endif /* !UNIV_HOTBACKUP */ + btr_blob_dbg_remove(page, index, "btr_page_reorganize"); /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) is preserved intact */ @@ -1026,6 +1591,8 @@ btr_page_reorganize_low( (!page_zip_compress(page_zip, page, index, NULL))) { /* Restore the old page and exit. */ + btr_blob_dbg_restore(page, temp_page, index, + "btr_page_reorganize_compress_fail"); #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG /* Check that the bytes that we skip are identical. */ @@ -1159,6 +1726,7 @@ btr_page_empty( #endif /* UNIV_ZIP_DEBUG */ btr_search_drop_page_hash_index(block); + btr_blob_dbg_remove(page, index, "btr_page_empty"); /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) is preserved intact */ @@ -2499,6 +3067,7 @@ btr_lift_page_up( index); } + btr_blob_dbg_remove(page, index, "btr_lift_page_up"); lock_update_copy_and_discard(father_block, block); /* Go upward to root page, decrementing levels by one. 
*/ @@ -2760,6 +3329,7 @@ err_exit: lock_update_merge_right(merge_block, orig_succ, block); } + btr_blob_dbg_remove(page, index, "btr_compress"); mem_heap_free(heap); if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) { @@ -2990,6 +3560,8 @@ btr_discard_page( block); } + btr_blob_dbg_remove(page, index, "btr_discard_page"); + /* Free the file page */ btr_page_free(index, block, mtr); diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.c index 8ba9fd721f5..e182840b1a0 100644 --- a/storage/innobase/btr/btr0cur.c +++ b/storage/innobase/btr/btr0cur.c @@ -2690,6 +2690,7 @@ btr_cur_del_mark_set_clust_rec( page_zip = buf_block_get_page_zip(block); + btr_blob_dbg_set_deleted_flag(rec, index, offsets, val); btr_rec_set_deleted_flag(rec, page_zip, val); trx = thr_get_trx(thr); @@ -3873,6 +3874,8 @@ btr_cur_set_ownership_of_extern_field( } else { mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val); } + + btr_blob_dbg_owner(rec, index, offsets, i, val); } /*******************************************************************//** @@ -4373,6 +4376,11 @@ btr_store_big_rec_extern_fields_func( } if (prev_page_no == FIL_NULL) { + btr_blob_dbg_add_blob( + rec, big_rec_vec->fields[i] + .field_no, page_no, index, + "store"); + mach_write_to_4(field_ref + BTR_EXTERN_SPACE_ID, space_id); @@ -4448,6 +4456,11 @@ next_zip_page: MLOG_4BYTES, &mtr); if (prev_page_no == FIL_NULL) { + btr_blob_dbg_add_blob( + rec, big_rec_vec->fields[i] + .field_no, page_no, index, + "store"); + mlog_write_ulint(field_ref + BTR_EXTERN_SPACE_ID, space_id, @@ -4616,6 +4629,37 @@ btr_free_externally_stored_field( rec_zip_size = 0; } +#ifdef UNIV_BLOB_DEBUG + if (!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG) + && !((field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG) + && (rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY))) { + /* This off-page column will be freed. + Check that no references remain. */ + + btr_blob_dbg_t b; + + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + + if (rec) { + /* Remove the reference from the record to the + BLOB. If the BLOB were not freed, the + reference would be removed when the record is + removed. Freeing the BLOB will overwrite the + BTR_EXTERN_PAGE_NO in the field_ref of the + record with FIL_NULL, which would make the + btr_blob_dbg information inconsistent with the + record. 
*/ + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + b.ref_field_no = i; + btr_blob_dbg_rbt_delete(index, &b, "free"); + } + + btr_blob_dbg_assert_empty(index, b.blob_page_no); + } +#endif /* UNIV_BLOB_DEBUG */ + for (;;) { #ifdef UNIV_SYNC_DEBUG buf_block_t* rec_block; diff --git a/storage/innobase/buf/buf0rea.c b/storage/innobase/buf/buf0rea.c index 519724d0b9a..3e36b707585 100644 --- a/storage/innobase/buf/buf0rea.c +++ b/storage/innobase/buf/buf0rea.c @@ -531,7 +531,7 @@ buf_read_ibuf_merge_pages( buf_pool_t* buf_pool; ulint zip_size = fil_space_get_zip_size(space_ids[i]); - buf_pool = buf_pool_get(space_ids[i], space_versions[i]); + buf_pool = buf_pool_get(space_ids[i], page_nos[i]); while (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { diff --git a/storage/innobase/dict/dict0load.c b/storage/innobase/dict/dict0load.c index 3561e7220ab..14490980bb6 100644 --- a/storage/innobase/dict/dict0load.c +++ b/storage/innobase/dict/dict0load.c @@ -1323,7 +1323,10 @@ ulint dict_load_indexes( /*==============*/ dict_table_t* table, /*!< in/out: table */ - mem_heap_t* heap) /*!< in: memory heap for temporary storage */ + mem_heap_t* heap, /*!< in: memory heap for temporary storage */ + dict_err_ignore_t ignore_err) + /*!< in: error to be ignored when + loading the index definition */ { dict_table_t* sys_indexes; dict_index_t* sys_index; @@ -1406,10 +1409,22 @@ dict_load_indexes( "InnoDB: but the index tree has been freed!\n", index->name, table->name); + if (ignore_err & DICT_ERR_IGNORE_INDEX_ROOT) { + /* If caller can tolerate this error, + we will continue to load the index and + let caller deal with this error. However + mark the index and table corrupted */ + index->corrupted = TRUE; + table->corrupted = TRUE; + fprintf(stderr, + "InnoDB: Index is corrupt but forcing" + " load into data dictionary\n"); + } else { corrupted: - dict_mem_index_free(index); - error = DB_CORRUPTION; - goto func_exit; + dict_mem_index_free(index); + error = DB_CORRUPTION; + goto func_exit; + } } else if (!dict_index_is_clust(index) && NULL == dict_table_get_first_index(table)) { @@ -1618,7 +1633,10 @@ dict_load_table( /*============*/ const char* name, /*!< in: table name in the databasename/tablename format */ - ibool cached) /*!< in: TRUE=add to cache, FALSE=do not */ + ibool cached, /*!< in: TRUE=add to cache, FALSE=do not */ + dict_err_ignore_t ignore_err) + /*!< in: error to be ignored when loading + table and its indexes' definition */ { dict_table_t* table; dict_table_t* sys_tables; @@ -1733,7 +1751,7 @@ err_exit: mem_heap_empty(heap); - err = dict_load_indexes(table, heap); + err = dict_load_indexes(table, heap, ignore_err); /* Initialize table foreign_child value. Its value could be changed when dict_load_foreigns() is called below */ @@ -1810,6 +1828,8 @@ dict_load_table_on_id( ut_ad(mutex_own(&(dict_sys->mutex))); + table = NULL; + /* NOTE that the operation of this function is protected by the dictionary mutex, and therefore no deadlocks can occur with other dictionary operations. 
*/ @@ -1836,15 +1856,17 @@ dict_load_table_on_id( BTR_SEARCH_LEAF, &pcur, &mtr); rec = btr_pcur_get_rec(&pcur); - if (!btr_pcur_is_on_user_rec(&pcur) - || rec_get_deleted_flag(rec, 0)) { + if (!btr_pcur_is_on_user_rec(&pcur)) { /* Not found */ + goto func_exit; + } - btr_pcur_close(&pcur); - mtr_commit(&mtr); - mem_heap_free(heap); - - return(NULL); + /* Find the first record that is not delete marked */ + while (rec_get_deleted_flag(rec, 0)) { + if (!btr_pcur_move_to_next_user_rec(&pcur, &mtr)) { + goto func_exit; + } + rec = btr_pcur_get_rec(&pcur); } /*---------------------------------------------------*/ @@ -1857,20 +1879,15 @@ dict_load_table_on_id( /* Check if the table id in record is the one searched for */ if (table_id != mach_read_from_8(field)) { - - btr_pcur_close(&pcur); - mtr_commit(&mtr); - mem_heap_free(heap); - - return(NULL); + goto func_exit; } /* Now we get the table name from the record */ field = rec_get_nth_field_old(rec, 1, &len); /* Load the table definition to memory */ table = dict_load_table(mem_heap_strdupl(heap, (char*) field, len), - TRUE); - + TRUE, DICT_ERR_IGNORE_NONE); +func_exit: btr_pcur_close(&pcur); mtr_commit(&mtr); mem_heap_free(heap); @@ -1894,7 +1911,7 @@ dict_load_sys_table( heap = mem_heap_create(1000); - dict_load_indexes(table, heap); + dict_load_indexes(table, heap, DICT_ERR_IGNORE_NONE); mem_heap_free(heap); } diff --git a/storage/innobase/dict/dict0mem.c b/storage/innobase/dict/dict0mem.c index 8d3d78f3900..a442a3811d8 100644 --- a/storage/innobase/dict/dict0mem.c +++ b/storage/innobase/dict/dict0mem.c @@ -38,6 +38,9 @@ Created 1/8/1996 Heikki Tuuri #ifndef UNIV_HOTBACKUP # include "lock0lock.h" #endif /* !UNIV_HOTBACKUP */ +#ifdef UNIV_BLOB_DEBUG +# include "ut0rbt.h" +#endif /* UNIV_BLOB_DEBUG */ #define DICT_HEAP_SIZE 100 /*!< initial memory heap size when creating a table or index object */ @@ -380,6 +383,12 @@ dict_mem_index_free( { ut_ad(index); ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); +#ifdef UNIV_BLOB_DEBUG + if (index->blobs) { + mutex_free(&index->blobs_mutex); + rbt_free(index->blobs); + } +#endif /* UNIV_BLOB_DEBUG */ mem_heap_free(index->heap); } diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 31c7ac38c24..14870b5ee63 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2000, 2010, MySQL AB & Innobase Oy. All Rights Reserved. +Copyright (c) 2000, 2011, MySQL AB & Innobase Oy. All Rights Reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. @@ -262,7 +262,7 @@ static PSI_mutex_info all_innodb_mutexes[] = { # endif /* UNIV_MEM_DEBUG */ {&mem_pool_mutex_key, "mem_pool_mutex", 0}, {&mutex_list_mutex_key, "mutex_list_mutex", 0}, - {&purge_sys_mutex_key, "purge_sys_mutex", 0}, + {&purge_sys_bh_mutex_key, "purge_sys_bh_mutex", 0}, {&recv_sys_mutex_key, "recv_sys_mutex", 0}, {&rseg_mutex_key, "rseg_mutex", 0}, # ifdef UNIV_SYNC_DEBUG @@ -10981,16 +10981,23 @@ static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity, static MYSQL_SYSVAR_ULONG(purge_batch_size, srv_purge_batch_size, PLUGIN_VAR_OPCMDARG, - "Number of UNDO logs to purge in one batch from the history list. 
" - "Default is 20", + "Number of UNDO log pages to purge in one batch from the history list.", NULL, NULL, 20, /* Default setting */ 1, /* Minimum value */ 5000, 0); /* Maximum value */ +static MYSQL_SYSVAR_ULONG(rollback_segments, srv_rollback_segments, + PLUGIN_VAR_OPCMDARG, + "Number of UNDO logs to use.", + NULL, NULL, + 128, /* Default setting */ + 1, /* Minimum value */ + TRX_SYS_N_RSEGS, 0); /* Maximum value */ + static MYSQL_SYSVAR_ULONG(purge_threads, srv_n_purge_threads, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, - "Purge threads can be either 0 or 1. Default is 0.", + "Purge threads can be either 0 or 1.", NULL, NULL, 0, /* Default setting */ 0, /* Minimum value */ @@ -11341,6 +11348,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(io_capacity), MYSQL_SYSVAR(purge_threads), MYSQL_SYSVAR(purge_batch_size), + MYSQL_SYSVAR(rollback_segments), NULL }; diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index c0bd3121c9c..e0279c13de2 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -786,10 +786,6 @@ err_exit: ut_ad(error == DB_SUCCESS); - /* We will need to rebuild index translation table. Set - valid index entry count in the translation table to zero */ - share->idx_trans_tbl.index_count = 0; - /* Commit the data dictionary transaction in order to release the table locks on the system tables. This means that if MySQL crashes while creating a new primary key inside @@ -915,6 +911,14 @@ error: } convert_error: + if (error == DB_SUCCESS) { + /* Build index is successful. We will need to + rebuild index translation table. Reset the + index entry count in the translation table + to zero, so that translation table will be rebuilt */ + share->idx_trans_tbl.index_count = 0; + } + error = convert_error_code_to_mysql(error, innodb_table->flags, user_thd); diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index e213c22937f..5b3e166371d 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -92,6 +92,91 @@ insert/delete buffer when the record is not in the buffer pool. */ buffer when the record is not in the buffer pool. */ #define BTR_DELETE 8192 +#ifdef UNIV_BLOB_DEBUG +# include "ut0rbt.h" +/** An index->blobs entry for keeping track of off-page column references */ +struct btr_blob_dbg_struct +{ + unsigned blob_page_no:32; /*!< first BLOB page number */ + unsigned ref_page_no:32; /*!< referring page number */ + unsigned ref_heap_no:16; /*!< referring heap number */ + unsigned ref_field_no:10; /*!< referring field number */ + unsigned owner:1; /*!< TRUE if BLOB owner */ + unsigned always_owner:1; /*!< TRUE if always + has been the BLOB owner; + reset to TRUE on B-tree + page splits and merges */ + unsigned del:1; /*!< TRUE if currently + delete-marked */ +}; + +/**************************************************************//** +Add a reference to an off-page column to the index->blobs map. */ +UNIV_INTERN +void +btr_blob_dbg_add_blob( +/*==================*/ + const rec_t* rec, /*!< in: clustered index record */ + ulint field_no, /*!< in: number of off-page column */ + ulint page_no, /*!< in: start page of the column */ + dict_index_t* index, /*!< in/out: index tree */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Display the references to off-page columns. 
+This function is to be called from a debugger, +for example when a breakpoint on ut_dbg_assertion_failed is hit. */ +UNIV_INTERN +void +btr_blob_dbg_print( +/*===============*/ + const dict_index_t* index) /*!< in: index tree */ + __attribute__((nonnull)); +/**************************************************************//** +Check that there are no references to off-page columns from or to +the given page. Invoked when freeing or clearing a page. +@return TRUE when no orphan references exist */ +UNIV_INTERN +ibool +btr_blob_dbg_is_empty( +/*==================*/ + dict_index_t* index, /*!< in: index */ + ulint page_no) /*!< in: page number */ + __attribute__((nonnull, warn_unused_result)); + +/**************************************************************//** +Modify the 'deleted' flag of a record. */ +UNIV_INTERN +void +btr_blob_dbg_set_deleted_flag( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ibool del) /*!< in: TRUE=deleted, FALSE=exists */ + __attribute__((nonnull)); +/**************************************************************//** +Change the ownership of an off-page column. */ +UNIV_INTERN +void +btr_blob_dbg_owner( +/*===============*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ulint i, /*!< in: ith field in rec */ + ibool own) /*!< in: TRUE=owned, FALSE=disowned */ + __attribute__((nonnull)); +/** Assert that there are no BLOB references to or from the given page. */ +# define btr_blob_dbg_assert_empty(index, page_no) \ + ut_a(btr_blob_dbg_is_empty(index, page_no)) +#else /* UNIV_BLOB_DEBUG */ +# define btr_blob_dbg_add_blob(rec, field_no, page, index, ctx) ((void) 0) +# define btr_blob_dbg_set_deleted_flag(rec, index, offsets, del)((void) 0) +# define btr_blob_dbg_owner(rec, index, offsets, i, val) ((void) 0) +# define btr_blob_dbg_assert_empty(index, page_no) ((void) 0) +#endif /* UNIV_BLOB_DEBUG */ + /**************************************************************//** Gets the root node of a tree and x-latches it. @return root page, x-latched */ diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h index ef4a6b04b34..07c06fb18d7 100644 --- a/storage/innobase/include/btr0types.h +++ b/storage/innobase/include/btr0types.h @@ -38,6 +38,131 @@ typedef struct btr_cur_struct btr_cur_t; /** B-tree search information for the adaptive hash index */ typedef struct btr_search_struct btr_search_t; +#ifdef UNIV_BLOB_DEBUG +# include "buf0types.h" +/** An index->blobs entry for keeping track of off-page column references */ +typedef struct btr_blob_dbg_struct btr_blob_dbg_t; + +/** Insert to index->blobs a reference to an off-page column. +@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_insert( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); + +/** Remove from index->blobs a reference to an off-page column. 
+@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_delete( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); + +/**************************************************************//** +Add to index->blobs any references to off-page columns from a record. +@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add_rec( +/*=================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Remove from index->blobs any references to off-page columns from a record. +@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove_rec( +/*====================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Count and add to index->blobs any references to off-page columns +from records on a page. +@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add( +/*=============*/ + const page_t* page, /*!< in: rewritten page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Count and remove from index->blobs any references to off-page columns +from records on a page. +Used when reorganizing a page, before copying the records. +@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove( +/*================*/ + const page_t* page, /*!< in: b-tree page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Restore in index->blobs any references to off-page columns +Used when page reorganize fails due to compressed page overflow. */ +UNIV_INTERN +void +btr_blob_dbg_restore( +/*=================*/ + const page_t* npage, /*!< in: page that failed to compress */ + const page_t* page, /*!< in: copy of original page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); + +/** Operation that processes the BLOB references of an index record +@param[in] rec record on index page +@param[in/out] index the index tree of the record +@param[in] offsets rec_get_offsets(rec,index) +@param[in] ctx context (for logging) +@return number of BLOB references processed */ +typedef ulint (*btr_blob_dbg_op_f) +(const rec_t* rec,dict_index_t* index,const ulint* offsets,const char* ctx); + +/**************************************************************//** +Count and process all references to off-page columns on a page. 
+@return number of references processed */ +UNIV_INTERN +ulint +btr_blob_dbg_op( +/*============*/ + const page_t* page, /*!< in: B-tree leaf page */ + const rec_t* rec, /*!< in: record to start from + (NULL to process the whole page) */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx, /*!< in: context (for logging) */ + const btr_blob_dbg_op_f op) /*!< in: operation on records */ + __attribute__((nonnull(1,3,4,5))); +#else /* UNIV_BLOB_DEBUG */ +# define btr_blob_dbg_add_rec(rec, index, offsets, ctx) ((void) 0) +# define btr_blob_dbg_add(page, index, ctx) ((void) 0) +# define btr_blob_dbg_remove_rec(rec, index, offsets, ctx) ((void) 0) +# define btr_blob_dbg_remove(page, index, ctx) ((void) 0) +# define btr_blob_dbg_restore(npage, page, index, ctx) ((void) 0) +# define btr_blob_dbg_op(page, rec, index, ctx, op) ((void) 0) +#endif /* UNIV_BLOB_DEBUG */ + /** The size of a reference to data stored on a different page. The reference is stored at the end of the prefix of the field in the index record. */ diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index 28d9b90e755..ae27f5dab0e 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h @@ -138,7 +138,18 @@ UNIV_INTERN void buf_flush_wait_batch_end( /*=====================*/ - buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + enum buf_flush type); /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ +/******************************************************************//** +Waits until a flush batch of the given type ends. This is called by +a thread that only wants to wait for a flush to end but doesn't do +any flushing itself. */ +UNIV_INTERN +void +buf_flush_wait_batch_end_wait_only( +/*===============================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ enum buf_flush type); /*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */ /********************************************************************//** diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index 033c435bf16..d6f2bebae3a 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -441,6 +441,18 @@ function. @return table, NULL if not found */ UNIV_INLINE dict_table_t* +dict_table_get_low_ignore_err( +/*===========================*/ + const char* table_name, /*!< in: table name */ + dict_err_ignore_t + ignore_err); /*!< in: error to be ignored when + loading a table definition */ +/**********************************************************************//** +Gets a table; loads it to the dictionary cache if necessary. A low-level +function. 
+@return table, NULL if not found */ +UNIV_INLINE +dict_table_t* dict_table_get_low( /*===============*/ const char* table_name); /*!< in: table name */ diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic index 42f124dedfc..59606af7056 100644 --- a/storage/innobase/include/dict0dict.ic +++ b/storage/innobase/include/dict0dict.ic @@ -828,6 +828,34 @@ dict_table_check_if_in_cache_low( } /**********************************************************************//** +load a table into dictionary cache, ignore any error specified during load; +@return table, NULL if not found */ +UNIV_INLINE +dict_table_t* +dict_table_get_low_ignore_err( +/*==========================*/ + const char* table_name, /*!< in: table name */ + dict_err_ignore_t + ignore_err) /*!< in: error to be ignored when + loading a table definition */ +{ + dict_table_t* table; + + ut_ad(table_name); + ut_ad(mutex_own(&(dict_sys->mutex))); + + table = dict_table_check_if_in_cache_low(table_name); + + if (table == NULL) { + table = dict_load_table(table_name, TRUE, ignore_err); + } + + ut_ad(!table || table->cached); + + return(table); +} + +/**********************************************************************//** Gets a table; loads it to the dictionary cache if necessary. A low-level function. @return table, NULL if not found */ @@ -845,7 +873,7 @@ dict_table_get_low( table = dict_table_check_if_in_cache_low(table_name); if (table == NULL) { - table = dict_load_table(table_name, TRUE); + table = dict_load_table(table_name, TRUE, DICT_ERR_IGNORE_NONE); } ut_ad(!table || table->cached); diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h index f009f221f32..51d07f43446 100644 --- a/storage/innobase/include/dict0load.h +++ b/storage/innobase/include/dict0load.h @@ -170,7 +170,10 @@ dict_load_table( /*============*/ const char* name, /*!< in: table name in the databasename/tablename format */ - ibool cached);/*!< in: TRUE=add to cache, FALSE=do not */ + ibool cached, /*!< in: TRUE=add to cache, FALSE=do not */ + dict_err_ignore_t ignore_err); + /*!< in: error to be ignored when loading + table and its indexes' definition */ /***********************************************************************//** Loads a table object based on the table id. 
@return table; NULL if table does not exist */ diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index 5757b79d3ed..75d3b2c3302 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -361,6 +361,8 @@ struct dict_index_struct{ /*!< TRUE if this index is marked to be dropped in ha_innobase::prepare_drop_index(), otherwise FALSE */ + unsigned corrupted:1; + /*!< TRUE if the index object is corrupted */ dict_field_t* fields; /*!< array of field descriptions */ #ifndef UNIV_HOTBACKUP UT_LIST_NODE_T(dict_index_t) @@ -395,6 +397,13 @@ struct dict_index_struct{ index, or 0 if the index existed when InnoDB was started up */ #endif /* !UNIV_HOTBACKUP */ +#ifdef UNIV_BLOB_DEBUG + mutex_t blobs_mutex; + /*!< mutex protecting blobs */ + void* blobs; /*!< map of (page_no,heap_no,field_no) + to first_blob_page_no; protected by + blobs_mutex; @see btr_blob_dbg_t */ +#endif /* UNIV_BLOB_DEBUG */ #ifdef UNIV_DEBUG ulint magic_n;/*!< magic number */ /** Value of dict_index_struct::magic_n */ @@ -487,6 +496,8 @@ struct dict_table_struct{ to the dictionary cache */ unsigned n_def:10;/*!< number of columns defined so far */ unsigned n_cols:10;/*!< number of columns */ + unsigned corrupted:1; + /*!< TRUE if table is corrupted */ dict_col_t* cols; /*!< array of column descriptions */ const char* col_names; /*!< Column names packed in a character string diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h index 687209575c9..8cbd7cd5783 100644 --- a/storage/innobase/include/dict0types.h +++ b/storage/innobase/include/dict0types.h @@ -43,4 +43,18 @@ typedef struct tab_node_struct tab_node_t; typedef ib_id_t table_id_t; typedef ib_id_t index_id_t; +/** Error to ignore when we load table dictionary into memory. However, +the table and index will be marked as "corrupted", and caller will +be responsible to deal with corrupted table or index. +Note: please define the IGNORE_ERR_* as bits, so their value can +be or-ed together */ +enum dict_err_ignore { + DICT_ERR_IGNORE_NONE = 0, /*!< no error to ignore */ + DICT_ERR_IGNORE_INDEX_ROOT = 1, /*!< ignore error if index root + page is FIL_NUL or incorrect value */ + DICT_ERR_IGNORE_ALL = 0xFFFF /*!< ignore all errors */ +}; + +typedef enum dict_err_ignore dict_err_ignore_t; + #endif diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h index 574809e5227..00c1d0516e6 100644 --- a/storage/innobase/include/page0zip.h +++ b/storage/innobase/include/page0zip.h @@ -420,7 +420,7 @@ page_zip_copy_recs( const page_t* src, /*!< in: page */ dict_index_t* index, /*!< in: index of the B-tree */ mtr_t* mtr) /*!< in: mini-transaction */ - __attribute__((nonnull(1,2,3,4))); + __attribute__((nonnull)); #endif /* !UNIV_HOTBACKUP */ /**********************************************************************//** diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index d600ad4a034..252d1424c63 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Innobase Oy. All Rights Reserved. Copyright (c) 2008, 2009, Google Inc. Copyright (c) 2009, Percona Inc. 
@@ -294,9 +294,12 @@ extern ulint srv_log_waits; /* the number of purge threads to use from the worker pool (currently 0 or 1) */ extern ulong srv_n_purge_threads; -/* the number of records to purge in one batch */ +/* the number of pages to purge in one batch */ extern ulong srv_purge_batch_size; +/* the number of rollback segments to use */ +extern ulong srv_rollback_segments; + /* variable that counts amount of data read in total (in bytes) */ extern ulint srv_data_read; diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h index fb3a24c5ee5..b4f9e21933f 100644 --- a/storage/innobase/include/sync0sync.h +++ b/storage/innobase/include/sync0sync.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Innobase Oy. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -93,7 +93,7 @@ extern mysql_pfs_key_t mem_hash_mutex_key; # endif /* UNIV_MEM_DEBUG */ extern mysql_pfs_key_t mem_pool_mutex_key; extern mysql_pfs_key_t mutex_list_mutex_key; -extern mysql_pfs_key_t purge_sys_mutex_key; +extern mysql_pfs_key_t purge_sys_bh_mutex_key; extern mysql_pfs_key_t recv_sys_mutex_key; extern mysql_pfs_key_t rseg_mutex_key; # ifdef UNIV_SYNC_DEBUG @@ -637,7 +637,6 @@ or row lock! */ #define SYNC_TREE_NODE_NEW 892 #define SYNC_TREE_NODE_FROM_HASH 891 #define SYNC_TREE_NODE 890 -#define SYNC_PURGE_SYS 810 #define SYNC_PURGE_LATCH 800 #define SYNC_TRX_UNDO 700 #define SYNC_RSEG 600 @@ -659,6 +658,7 @@ or row lock! */ #define SYNC_REC_LOCK 299 #define SYNC_TRX_LOCK_HEAP 298 #define SYNC_TRX_SYS_HEADER 290 +#define SYNC_PURGE_QUEUE 200 #define SYNC_LOG 170 #define SYNC_LOG_FLUSH_ORDER 147 #define SYNC_RECV 168 diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h index d2730a68a78..0b83a76cab7 100644 --- a/storage/innobase/include/trx0purge.h +++ b/storage/innobase/include/trx0purge.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -68,8 +68,9 @@ Creates the global purge system control structure and inits the history mutex. */ UNIV_INTERN void -trx_purge_sys_create(void); -/*======================*/ +trx_purge_sys_create( +/*=================*/ + ib_bh_t* ib_bh); /*!< in/own: UNDO log min binary heap*/ /********************************************************************//** Frees the global purge system control structure. */ UNIV_INTERN @@ -128,20 +129,20 @@ struct trx_purge_struct{ ulint state; /*!< Purge system state */ sess_t* sess; /*!< System session running the purge query */ - trx_t* trx; /*!< System transaction running the purge + trx_t* trx; /*!< System transaction running the + purge query: this trx is not in the trx list of the trx system and it never ends */ que_t* query; /*!< The query graph which will do the parallelized purge operation */ - rw_lock_t latch; /*!< The latch protecting the purge view. - A purge operation must acquire an - x-latch here for the instant at which + rw_lock_t latch; /*!< The latch protecting the purge + view. 
A purge operation must acquire + an x-latch here for the instant at which it changes the purge view: an undo log operation can prevent this by obtaining an s-latch here. */ read_view_t* view; /*!< The purge will not remove undo logs which are >= this view (purge view) */ - mutex_t mutex; /*!< Mutex protecting the fields below */ ulint n_pages_handled;/*!< Approximate number of undo log pages processed in purge */ ulint handle_limit; /*!< Target of how many pages to get @@ -179,6 +180,11 @@ struct trx_purge_struct{ mem_heap_t* heap; /*!< Temporary storage used during a purge: can be emptied after purge completes */ + /*-----------------------------*/ + ib_bh_t* ib_bh; /*!< Binary min-heap, ordered on + rseg_queue_t::trx_no. It is protected + by the bh_mutex */ + mutex_t bh_mutex; /*!< Mutex protecting ib_bh */ }; #define TRX_PURGE_ON 1 /* purge operation is running */ diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h index a293d9b896e..5acde05de3d 100644 --- a/storage/innobase/include/trx0rseg.h +++ b/storage/innobase/include/trx0rseg.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -113,7 +113,9 @@ void trx_rseg_list_and_array_init( /*=========================*/ trx_sysf_t* sys_header, /*!< in: trx system header */ + ib_bh_t* ib_bh, /*!< in: rseg queue */ mtr_t* mtr); /*!< in: mtr */ + /*************************************************************************** Free's an instance of the rollback segment in memory. */ UNIV_INTERN @@ -180,6 +182,14 @@ struct trx_rseg_struct{ memory objects */ }; +/** For prioritising the rollback segments for purge. */ +struct rseg_queue_struct { + trx_id_t trx_no; /*!< trx_rseg_t::last_trx_no */ + trx_rseg_t* rseg; /*!< Rollback segment */ +}; + +typedef struct rseg_queue_struct rseg_queue_t; + /* Undo log segment slot in a rollback segment header */ /*-------------------------------------------------------------*/ #define TRX_RSEG_SLOT_PAGE_NO 0 /* Page number of the header page of diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index 63e3f6be934..dc0ca2285b9 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -38,6 +38,7 @@ Created 3/26/1996 Heikki Tuuri #include "mem0mem.h" #include "sync0sync.h" #include "ut0lst.h" +#include "ut0bh.h" #include "read0types.h" #include "page0types.h" @@ -221,13 +222,6 @@ UNIV_INLINE trx_id_t trx_sys_get_new_trx_id(void); /*========================*/ -/*****************************************************************//** -Allocates a new transaction number. -@return new, allocated trx number */ -UNIV_INLINE -trx_id_t -trx_sys_get_new_trx_no(void); -/*========================*/ #endif /* !UNIV_HOTBACKUP */ /*****************************************************************//** Writes a trx id to an index page. 
In case that the id size changes in diff --git a/storage/innobase/include/trx0sys.ic b/storage/innobase/include/trx0sys.ic index 385c7f4f0cc..355f118a1ec 100644 --- a/storage/innobase/include/trx0sys.ic +++ b/storage/innobase/include/trx0sys.ic @@ -369,16 +369,4 @@ trx_sys_get_new_trx_id(void) return(id); } -/*****************************************************************//** -Allocates a new transaction number. -@return new, allocated trx number */ -UNIV_INLINE -trx_id_t -trx_sys_get_new_trx_no(void) -/*========================*/ -{ - ut_ad(mutex_own(&kernel_mutex)); - - return(trx_sys_get_new_trx_id()); -} #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index a99b8306eb8..f561226a2de 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -51,7 +51,7 @@ Created 1/20/1994 Heikki Tuuri #define INNODB_VERSION_MAJOR 1 #define INNODB_VERSION_MINOR 1 -#define INNODB_VERSION_BUGFIX 5 +#define INNODB_VERSION_BUGFIX 6 /* The following is the InnoDB version as shown in SELECT plugin_version FROM information_schema.plugins; @@ -201,6 +201,8 @@ this will break redo log file compatibility, but it may be useful when debugging redo log application problems. */ #define UNIV_MEM_DEBUG /* detect memory leaks etc */ #define UNIV_IBUF_DEBUG /* debug the insert buffer */ +#define UNIV_BLOB_DEBUG /* track BLOB ownership; +assumes that no BLOBs survive server restart */ #define UNIV_IBUF_COUNT_DEBUG /* debug the insert buffer; this limits the database to IBUF_COUNT_N_SPACES and IBUF_COUNT_N_PAGES, and the insert buffer must be empty when the database is started */ diff --git a/storage/innobase/include/ut0bh.h b/storage/innobase/include/ut0bh.h new file mode 100644 index 00000000000..1b211390283 --- /dev/null +++ b/storage/innobase/include/ut0bh.h @@ -0,0 +1,152 @@ +/***************************************************************************//** + +Copyright (c) 2011, Oracle Corpn. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0bh.h +Binary min-heap interface. + +Created 2010-05-28 by Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_UT0BH_H +#define INNOBASE_UT0BH_H + +#include "univ.i" + +/** Comparison function for objects in the binary heap. */ +typedef int (*ib_bh_cmp_t)(const void* p1, const void* p2); + +typedef struct ib_bh_struct ib_bh_t; + +/**********************************************************************//** +Get the number of elements in the binary heap. +@return number of elements */ +UNIV_INLINE +ulint +ib_bh_size( +/*=======*/ + const ib_bh_t* ib_bh); /*!< in: instance */ + +/**********************************************************************//** +Test if binary heap is empty. 
+@return TRUE if empty. */ +UNIV_INLINE +ibool +ib_bh_is_empty( +/*===========*/ + const ib_bh_t* ib_bh); /*!< in: instance */ + +/**********************************************************************//** +Test if binary heap is full. +@return TRUE if full. */ +UNIV_INLINE +ibool +ib_bh_is_full( +/*===========*/ + const ib_bh_t* ib_bh); /*!< in: instance */ + +/**********************************************************************//** +Get a pointer to the element. +@return pointer to element */ +UNIV_INLINE +void* +ib_bh_get( +/*=======*/ + ib_bh_t* ib_bh, /*!< in: instance */ + ulint i); /*!< in: index */ + +/**********************************************************************//** +Copy an element to the binary heap. +@return pointer to copied element */ +UNIV_INLINE +void* +ib_bh_set( +/*======*/ + ib_bh_t* ib_bh, /*!< in/out: instance */ + ulint i, /*!< in: index */ + const void* elem); /*!< in: element to add */ + +/**********************************************************************//** +Return the first element from the binary heap. +@return pointer to first element or NULL if empty. */ +UNIV_INLINE +void* +ib_bh_first( +/*========*/ + ib_bh_t* ib_bh); /*!< in: instance */ + +/**********************************************************************//** +Return the last element from the binary heap. +@return pointer to last element or NULL if empty. */ +UNIV_INLINE +void* +ib_bh_last( +/*========*/ + ib_bh_t* ib_bh); /*!< in/out: instance */ + +/**********************************************************************//** +Create a binary heap. +@return a new binary heap */ +UNIV_INTERN +ib_bh_t* +ib_bh_create( +/*=========*/ + ib_bh_cmp_t compare, /*!< in: comparator */ + ulint sizeof_elem, /*!< in: size of one element */ + ulint max_elems); /*!< in: max elements allowed */ + +/**********************************************************************//** +Free a binary heap. +@return a new binary heap */ +UNIV_INTERN +void +ib_bh_free( +/*=======*/ + ib_bh_t* ib_bh); /*!< in,own: instance */ + +/**********************************************************************//** +Add an element to the binary heap. Note: The element is copied. +@return pointer to added element or NULL if full. */ +UNIV_INTERN +void* +ib_bh_push( +/*=======*/ + ib_bh_t* ib_bh, /*!< in/out: instance */ + const void* elem); /*!< in: element to add */ + +/**********************************************************************//** +Remove the first element from the binary heap. */ +UNIV_INTERN +void +ib_bh_pop( +/*======*/ + ib_bh_t* ib_bh); /*!< in/out: instance */ + +/** Binary heap data structure */ +struct ib_bh_struct { + ulint max_elems; /*!< max elements allowed */ + ulint n_elems; /*!< current size */ + ulint sizeof_elem; /*!< sizeof element */ + ib_bh_cmp_t compare; /*!< comparator */ +}; + +#ifndef UNIV_NONINL +#include "ut0bh.ic" +#endif + +#endif /* INNOBASE_UT0BH_H */ diff --git a/storage/innobase/include/ut0bh.ic b/storage/innobase/include/ut0bh.ic new file mode 100644 index 00000000000..afbe58e7e3b --- /dev/null +++ b/storage/innobase/include/ut0bh.ic @@ -0,0 +1,125 @@ +/***************************************************************************//** +Copyright (c) 2011, Oracle Corpn. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0bh.ic +Binary min-heap implementation. + +Created 2011-01-15 by Sunny Bains +*******************************************************/ + +#include "ut0bh.h" +#include "ut0mem.h" /* For ut_memcpy() */ + +/**********************************************************************//** +Get the number of elements in the binary heap. +@return number of elements */ +UNIV_INLINE +ulint +ib_bh_size( +/*=======*/ + const ib_bh_t* ib_bh) /*!< in: instance */ +{ + return(ib_bh->n_elems); +} + +/**********************************************************************//** +Test if binary heap is empty. +@return TRUE if empty. */ +UNIV_INLINE +ibool +ib_bh_is_empty( +/*===========*/ + const ib_bh_t* ib_bh) /*!< in: instance */ +{ + return(ib_bh_size(ib_bh) == 0); +} + +/**********************************************************************//** +Test if binary heap is full. +@return TRUE if full. */ +UNIV_INLINE +ibool +ib_bh_is_full( +/*===========*/ + const ib_bh_t* ib_bh) /*!< in: instance */ +{ + return(ib_bh_size(ib_bh) >= ib_bh->max_elems); +} + +/**********************************************************************//** +Get a pointer to the element. +@return pointer to element */ +UNIV_INLINE +void* +ib_bh_get( +/*=======*/ + ib_bh_t* ib_bh, /*!< in: instance */ + ulint i) /*!< in: index */ +{ + byte* ptr = (byte*) (ib_bh + 1); + + ut_a(i < ib_bh_size(ib_bh)); + + return(ptr + (ib_bh->sizeof_elem * i)); +} + +/**********************************************************************//** +Copy an element to the binary heap. +@return pointer to copied element */ +UNIV_INLINE +void* +ib_bh_set( +/*======*/ + ib_bh_t* ib_bh, /*!< in/out: instance */ + ulint i, /*!< in: index */ + const void* elem) /*!< in: element to add */ +{ + void* ptr = ib_bh_get(ib_bh, i); + + ut_memcpy(ptr, elem, ib_bh->sizeof_elem); + + return(ptr); +} + +/**********************************************************************//** +Return the first element from the binary heap. +@return pointer to first element or NULL if empty. */ +UNIV_INLINE +void* +ib_bh_first( +/*========*/ + ib_bh_t* ib_bh) /*!< in: instance */ +{ + return(ib_bh_is_empty(ib_bh) ? NULL : ib_bh_get(ib_bh, 0)); +} + +/**********************************************************************//** +Return the last element from the binary heap. +@return pointer to last element or NULL if empty. */ +UNIV_INLINE +void* +ib_bh_last( +/*========*/ + ib_bh_t* ib_bh) /*!< in/out: instance */ +{ + return(ib_bh_is_empty(ib_bh) + ? 
NULL + : ib_bh_get(ib_bh, ib_bh_size(ib_bh) - 1)); +} + + diff --git a/storage/innobase/page/page0cur.c b/storage/innobase/page/page0cur.c index f10f16a7dd9..936762b986a 100644 --- a/storage/innobase/page/page0cur.c +++ b/storage/innobase/page/page0cur.c @@ -1149,6 +1149,8 @@ use_heap: current_rec, index, mtr); } + btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert"); + return(insert_rec); } @@ -1195,10 +1197,12 @@ page_cur_insert_rec_zip_reorg( } /* Out of space: restore the page */ + btr_blob_dbg_remove(page, index, "insert_zip_fail"); if (!page_zip_decompress(page_zip, page, FALSE)) { ut_error; /* Memory corrupted? */ } ut_ad(page_validate(page, index)); + btr_blob_dbg_add(page, index, "insert_zip_fail"); return(NULL); } @@ -1490,6 +1494,8 @@ use_heap: page_zip_write_rec(page_zip, insert_rec, index, offsets, 1); + btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert_zip_ok"); + /* 9. Write log record of the insert */ if (UNIV_LIKELY(mtr != NULL)) { page_cur_insert_rec_write_log(insert_rec, rec_size, @@ -1697,6 +1703,9 @@ page_copy_rec_list_end_to_created_page( heap_top += rec_size; + rec_offs_make_valid(insert_rec, index, offsets); + btr_blob_dbg_add_rec(insert_rec, index, offsets, "copy_end"); + page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, index, mtr); prev_rec = insert_rec; @@ -1944,6 +1953,7 @@ page_cur_delete_rec( page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1); /* 6. Free the memory occupied by the record */ + btr_blob_dbg_remove_rec(current_rec, index, offsets, "delete"); page_mem_free(page, page_zip, current_rec, index, offsets); /* 7. Now we have decremented the number of owned records of the slot. diff --git a/storage/innobase/page/page0page.c b/storage/innobase/page/page0page.c index 2e785412ac9..99182e85849 100644 --- a/storage/innobase/page/page0page.c +++ b/storage/innobase/page/page0page.c @@ -685,12 +685,16 @@ page_copy_rec_list_end( if (UNIV_UNLIKELY (!page_zip_reorganize(new_block, index, mtr))) { + btr_blob_dbg_remove(new_page, index, + "copy_end_reorg_fail"); if (UNIV_UNLIKELY (!page_zip_decompress(new_page_zip, new_page, FALSE))) { ut_error; } ut_ad(page_validate(new_page, index)); + btr_blob_dbg_add(new_page, index, + "copy_end_reorg_fail"); return(NULL); } else { /* The page was reorganized: @@ -803,12 +807,16 @@ page_copy_rec_list_start( if (UNIV_UNLIKELY (!page_zip_reorganize(new_block, index, mtr))) { + btr_blob_dbg_remove(new_page, index, + "copy_start_reorg_fail"); if (UNIV_UNLIKELY (!page_zip_decompress(new_page_zip, new_page, FALSE))) { ut_error; } ut_ad(page_validate(new_page, index)); + btr_blob_dbg_add(new_page, index, + "copy_start_reorg_fail"); return(NULL); } else { /* The page was reorganized: @@ -1080,6 +1088,9 @@ page_delete_rec_list_end( /* Remove the record chain segment from the record chain */ page_rec_set_next(prev_rec, page_get_supremum_rec(page)); + btr_blob_dbg_op(page, rec, index, "delete_end", + btr_blob_dbg_remove_rec); + /* Catenate the deleted chain segment to the page free list */ page_rec_set_next(last_rec, page_header_get_ptr(page, PAGE_FREE)); diff --git a/storage/innobase/page/page0zip.c b/storage/innobase/page/page0zip.c index 4c5371370da..704b0c88e02 100644 --- a/storage/innobase/page/page0zip.c +++ b/storage/innobase/page/page0zip.c @@ -4452,6 +4452,8 @@ page_zip_reorganize( /* Copy the old page to temporary space */ buf_frame_copy(temp_page, page); + btr_blob_dbg_remove(page, index, "zip_reorg"); + /* Recreate the page: note that global data on page (possible segment headers, next 
page-field, etc.) is preserved intact */ @@ -4510,7 +4512,7 @@ page_zip_copy_recs( mtr_t* mtr) /*!< in: mini-transaction */ { ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); - ut_ad(mtr_memo_contains_page(mtr, (page_t*) src, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, src, MTR_MEMO_PAGE_X_FIX)); ut_ad(!dict_index_is_ibuf(index)); #ifdef UNIV_ZIP_DEBUG /* The B-tree operations that call this function may set @@ -4580,6 +4582,7 @@ page_zip_copy_recs( #ifdef UNIV_ZIP_DEBUG ut_a(page_zip_validate(page_zip, page)); #endif /* UNIV_ZIP_DEBUG */ + btr_blob_dbg_add(page, index, "page_zip_copy_recs"); page_zip_compress_write_log(page_zip, page, index, mtr); } diff --git a/storage/innobase/row/row0mysql.c b/storage/innobase/row/row0mysql.c index 31aa49d4267..f7e5c5fdceb 100644 --- a/storage/innobase/row/row0mysql.c +++ b/storage/innobase/row/row0mysql.c @@ -3120,7 +3120,7 @@ row_drop_table_for_mysql( ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - table = dict_table_get_low(name); + table = dict_table_get_low_ignore_err(name, DICT_ERR_IGNORE_INDEX_ROOT); if (!table) { err = DB_TABLE_NOT_FOUND; @@ -3355,7 +3355,7 @@ check_next_foreign: dict_table_remove_from_cache(table); - if (dict_load_table(name, TRUE) != NULL) { + if (dict_load_table(name, TRUE, DICT_ERR_IGNORE_NONE) != NULL) { ut_print_timestamp(stderr); fputs(" InnoDB: Error: not able to remove table ", stderr); @@ -3501,7 +3501,7 @@ row_mysql_drop_temp_tables(void) btr_pcur_store_position(&pcur, &mtr); btr_pcur_commit_specify_mtr(&pcur, &mtr); - table = dict_load_table(table_name, TRUE); + table = dict_load_table(table_name, TRUE, DICT_ERR_IGNORE_NONE); if (table) { row_drop_table_for_mysql(table_name, trx, FALSE); diff --git a/storage/innobase/row/row0upd.c b/storage/innobase/row/row0upd.c index c6bff0fccd6..ea71a16bed2 100644 --- a/storage/innobase/row/row0upd.c +++ b/storage/innobase/row/row0upd.c @@ -498,14 +498,49 @@ row_upd_rec_in_place( n_fields = upd_get_n_fields(update); for (i = 0; i < n_fields; i++) { +#ifdef UNIV_BLOB_DEBUG + btr_blob_dbg_t b; + const byte* field_ref = NULL; +#endif /* UNIV_BLOB_DEBUG */ + upd_field = upd_get_nth_field(update, i); new_val = &(upd_field->new_val); ut_ad(!dfield_is_ext(new_val) == !rec_offs_nth_extern(offsets, upd_field->field_no)); +#ifdef UNIV_BLOB_DEBUG + if (dfield_is_ext(new_val)) { + ulint len; + field_ref = rec_get_nth_field(rec, offsets, i, &len); + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + b.ref_field_no = i; + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + ut_a(b.ref_field_no >= index->n_uniq); + btr_blob_dbg_rbt_delete(index, &b, "upd_in_place"); + } +#endif /* UNIV_BLOB_DEBUG */ rec_set_nth_field(rec, offsets, upd_field->field_no, dfield_get_data(new_val), dfield_get_len(new_val)); + +#ifdef UNIV_BLOB_DEBUG + if (dfield_is_ext(new_val)) { + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + b.always_owner = b.owner = !(field_ref[BTR_EXTERN_LEN] + & BTR_EXTERN_OWNER_FLAG); + b.del = rec_get_deleted_flag( + rec, rec_offs_comp(offsets)); + + btr_blob_dbg_rbt_insert(index, &b, "upd_in_place"); + } +#endif /* UNIV_BLOB_DEBUG */ } if (UNIV_LIKELY_NULL(page_zip)) { diff --git a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c index dcfd5463fbc..3e10aa03096 100644 --- 
a/storage/innobase/srv/srv0srv.c +++ b/storage/innobase/srv/srv0srv.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. @@ -269,9 +269,12 @@ UNIV_INTERN ulong srv_max_buf_pool_modified_pct = 75; /* the number of purge threads to use from the worker pool (currently 0 or 1).*/ UNIV_INTERN ulong srv_n_purge_threads = 0; -/* the number of records to purge in one batch */ +/* the number of pages to purge in one batch */ UNIV_INTERN ulong srv_purge_batch_size = 20; +/* the number of rollback segments to use */ +UNIV_INTERN ulong srv_rollback_segments = TRX_SYS_N_RSEGS; + /* variable counts amount of data read in total (in bytes) */ UNIV_INTERN ulint srv_data_read = 0; @@ -3096,6 +3099,7 @@ srv_purge_thread( required by os_thread_create */ { srv_slot_t* slot; + ulint retries = 0; ulint slot_no = ULINT_UNDEFINED; ulint n_total_purged = ULINT_UNDEFINED; @@ -3122,7 +3126,7 @@ srv_purge_thread( while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) { - ulint n_pages_purged; + ulint n_pages_purged = 0; /* If there are very few records to purge or the last purge didn't purge any records then wait for activity. @@ -3130,7 +3134,8 @@ srv_purge_thread( because in the worst case we will end up waiting for the next purge event. */ if (trx_sys->rseg_history_len < srv_purge_batch_size - || n_total_purged == 0) { + || (n_total_purged == 0 + && retries >= TRX_SYS_N_RSEGS)) { os_event_t event; @@ -3141,6 +3146,8 @@ srv_purge_thread( mutex_exit(&kernel_mutex); os_event_wait(event); + + retries = 0; } /* Check for shutdown and whether we should do purge at all. */ @@ -3151,7 +3158,12 @@ srv_purge_thread( break; } - n_total_purged = 0; + if (n_total_purged == 0 && retries <= TRX_SYS_N_RSEGS) { + ++retries; + } else if (n_total_purged > 0) { + retries = 0; + n_total_purged = 0; + } /* Purge until there are no more records to purge and there is no change in configuration or server state. */ diff --git a/storage/innobase/srv/srv0start.c b/storage/innobase/srv/srv0start.c index 2a9ad0e7541..050fe460652 100644 --- a/storage/innobase/srv/srv0start.c +++ b/storage/innobase/srv/srv0start.c @@ -1083,6 +1083,12 @@ innobase_start_or_create_for_mysql(void) # endif #endif +#ifdef UNIV_BLOB_DEBUG + fprintf(stderr, + "InnoDB: !!!!!!!! UNIV_BLOB_DEBUG switched on !!!!!!!!!\n" + "InnoDB: Server restart may fail with UNIV_BLOB_DEBUG\n"); +#endif /* UNIV_BLOB_DEBUG */ + #ifdef UNIV_SYNC_DEBUG ut_print_timestamp(stderr); fprintf(stderr, diff --git a/storage/innobase/sync/sync0rw.c b/storage/innobase/sync/sync0rw.c index 295a010f9d7..73d1e3aa46e 100644 --- a/storage/innobase/sync/sync0rw.c +++ b/storage/innobase/sync/sync0rw.c @@ -271,6 +271,9 @@ rw_lock_create_func( contains garbage at initialization and cannot be used for recursive x-locking. */ lock->recursive = FALSE; + /* Silence Valgrind when UNIV_DEBUG_VALGRIND is not enabled. 
*/ + memset((void*) &lock->writer_thread, 0, sizeof lock->writer_thread); + UNIV_MEM_INVALID(&lock->writer_thread, sizeof lock->writer_thread); #ifdef UNIV_SYNC_DEBUG UT_LIST_INIT(lock->debug_list); diff --git a/storage/innobase/sync/sync0sync.c b/storage/innobase/sync/sync0sync.c index 453314f465d..0f6a60ca260 100644 --- a/storage/innobase/sync/sync0sync.c +++ b/storage/innobase/sync/sync0sync.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Innobase Oy. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -1172,7 +1172,7 @@ sync_thread_add_level( case SYNC_RSEG: case SYNC_TRX_UNDO: case SYNC_PURGE_LATCH: - case SYNC_PURGE_SYS: + case SYNC_PURGE_QUEUE: case SYNC_DICT_AUTOINC_MUTEX: case SYNC_DICT_OPERATION: case SYNC_DICT_HEADER: @@ -1239,10 +1239,16 @@ sync_thread_add_level( || sync_thread_levels_g(array, SYNC_FSP, TRUE)); break; case SYNC_TRX_UNDO_PAGE: + /* Purge is allowed to read in as many UNDO pages as it likes, + there was a bogus rule here earlier that forced the caller to + acquire the purge_sys_t::mutex. The purge mutex did not really + protect anything because it was only ever acquired by the + single purge thread. The purge thread can read the UNDO pages + without any covering mutex. */ + ut_a(sync_thread_levels_contain(array, SYNC_TRX_UNDO) || sync_thread_levels_contain(array, SYNC_RSEG) - || sync_thread_levels_contain(array, SYNC_PURGE_SYS) - || sync_thread_levels_g(array, SYNC_TRX_UNDO_PAGE, TRUE)); + || sync_thread_levels_g(array, level - 1, TRUE)); break; case SYNC_RSEG_HEADER: ut_a(sync_thread_levels_contain(array, SYNC_RSEG)); diff --git a/storage/innobase/trx/trx0purge.c b/storage/innobase/trx/trx0purge.c index 4c787579a03..02ec9f1c072 100644 --- a/storage/innobase/trx/trx0purge.c +++ b/storage/innobase/trx/trx0purge.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -57,8 +57,8 @@ UNIV_INTERN mysql_pfs_key_t trx_purge_latch_key; #endif /* UNIV_PFS_RWLOCK */ #ifdef UNIV_PFS_MUTEX -/* Key to register purge_sys_mutex with performance schema */ -UNIV_INTERN mysql_pfs_key_t purge_sys_mutex_key; +/* Key to register purge_sys_bh_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t purge_sys_bh_mutex_key; #endif /* UNIV_PFS_MUTEX */ /*****************************************************************//** @@ -219,13 +219,16 @@ Creates the global purge system control structure and inits the history mutex. */ UNIV_INTERN void -trx_purge_sys_create(void) -/*======================*/ +trx_purge_sys_create( +/*=================*/ + ib_bh_t* ib_bh) /*!< in, own: UNDO log min binary heap */ { ut_ad(mutex_own(&kernel_mutex)); - purge_sys = mem_alloc(sizeof(trx_purge_t)); + purge_sys = mem_zalloc(sizeof(trx_purge_t)); + /* Take ownership of ib_bh, we are responsible for freeing it. 
*/ + purge_sys->ib_bh = ib_bh; purge_sys->state = TRX_STOP_PURGE; purge_sys->n_pages_handled = 0; @@ -237,8 +240,9 @@ trx_purge_sys_create(void) rw_lock_create(trx_purge_latch_key, &purge_sys->latch, SYNC_PURGE_LATCH); - mutex_create(purge_sys_mutex_key, - &purge_sys->mutex, SYNC_PURGE_SYS); + mutex_create( + purge_sys_bh_mutex_key, &purge_sys->bh_mutex, + SYNC_PURGE_QUEUE); purge_sys->heap = mem_heap_create(256); @@ -288,9 +292,12 @@ trx_purge_sys_close(void) trx_undo_arr_free(purge_sys->arr); rw_lock_free(&purge_sys->latch); - mutex_free(&purge_sys->mutex); + mutex_free(&purge_sys->bh_mutex); mem_heap_free(purge_sys->heap); + + ib_bh_free(purge_sys->ib_bh); + mem_free(purge_sys); purge_sys = NULL; @@ -311,34 +318,31 @@ trx_purge_add_update_undo_to_history( mtr_t* mtr) /*!< in: mtr */ { trx_undo_t* undo; - trx_rseg_t* rseg; trx_rsegf_t* rseg_header; -#ifdef UNIV_DEBUG - trx_usegf_t* seg_header; -#endif /* UNIV_DEBUG */ trx_ulogf_t* undo_header; - ulint hist_size; undo = trx->update_undo; ut_ad(undo); - rseg = undo->rseg; + ut_ad(mutex_own(&undo->rseg->mutex)); - ut_ad(mutex_own(&(rseg->mutex))); - - rseg_header = trx_rsegf_get(rseg->space, rseg->zip_size, - rseg->page_no, mtr); + rseg_header = trx_rsegf_get( + undo->rseg->space, undo->rseg->zip_size, undo->rseg->page_no, + mtr); undo_header = undo_page + undo->hdr_offset; + /* Add the log as the first in the history list */ + + if (undo->state != TRX_UNDO_CACHED) { + ulint hist_size; #ifdef UNIV_DEBUG - seg_header = undo_page + TRX_UNDO_SEG_HDR; + trx_usegf_t* seg_header = undo_page + TRX_UNDO_SEG_HDR; #endif /* UNIV_DEBUG */ - if (undo->state != TRX_UNDO_CACHED) { /* The undo log segment will not be reused */ - if (undo->id >= TRX_RSEG_N_SLOTS) { + if (UNIV_UNLIKELY(undo->id >= TRX_RSEG_N_SLOTS)) { fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", (ulong) undo->id); @@ -347,42 +351,50 @@ trx_purge_add_update_undo_to_history( trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr); - hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, - MLOG_4BYTES, mtr); + hist_size = mtr_read_ulint( + rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr); + ut_ad(undo->size == flst_get_len( seg_header + TRX_UNDO_PAGE_LIST, mtr)); - mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, - hist_size + undo->size, MLOG_4BYTES, mtr); + mlog_write_ulint( + rseg_header + TRX_RSEG_HISTORY_SIZE, + hist_size + undo->size, MLOG_4BYTES, mtr); } - /* Add the log as the first in the history list */ - flst_add_first(rseg_header + TRX_RSEG_HISTORY, - undo_header + TRX_UNDO_HISTORY_NODE, mtr); - mutex_enter(&kernel_mutex); - trx_sys->rseg_history_len++; - mutex_exit(&kernel_mutex); - - if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) { - /* Inform the purge thread that there is work to do. 
*/ - srv_wake_purge_thread_if_not_active(); - } + flst_add_first( + rseg_header + TRX_RSEG_HISTORY, + undo_header + TRX_UNDO_HISTORY_NODE, mtr); /* Write the trx number to the undo log header */ + mlog_write_ull(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr); + /* Write information about delete markings to the undo log header */ if (!undo->del_marks) { - mlog_write_ulint(undo_header + TRX_UNDO_DEL_MARKS, FALSE, - MLOG_2BYTES, mtr); + mlog_write_ulint( + undo_header + TRX_UNDO_DEL_MARKS, FALSE, + MLOG_2BYTES, mtr); } - if (rseg->last_page_no == FIL_NULL) { + if (undo->rseg->last_page_no == FIL_NULL) { + undo->rseg->last_trx_no = trx->no; + undo->rseg->last_offset = undo->hdr_offset; + undo->rseg->last_page_no = undo->hdr_page_no; + undo->rseg->last_del_marks = undo->del_marks; - rseg->last_page_no = undo->hdr_page_no; - rseg->last_offset = undo->hdr_offset; - rseg->last_trx_no = trx->no; - rseg->last_del_marks = undo->del_marks; + /* FIXME: Add a bin heap validate function to check that + the rseg exists. */ + } + + mutex_enter(&kernel_mutex); + trx_sys->rseg_history_len++; + mutex_exit(&kernel_mutex); + + if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) { + /* Inform the purge thread that there is work to do. */ + srv_wake_purge_thread_if_not_active(); } } @@ -411,7 +423,6 @@ trx_purge_free_segment( /* fputs("Freeing an update undo log segment\n", stderr); */ - ut_ad(mutex_own(&(purge_sys->mutex))); loop: mtr_start(&mtr); mutex_enter(&(rseg->mutex)); @@ -515,8 +526,6 @@ trx_purge_truncate_rseg_history( mtr_t mtr; trx_id_t undo_trx_no; - ut_ad(mutex_own(&(purge_sys->mutex))); - mtr_start(&mtr); mutex_enter(&(rseg->mutex)); @@ -609,10 +618,8 @@ trx_purge_truncate_history(void) trx_id_t limit_trx_no; undo_no_t limit_undo_no; - ut_ad(mutex_own(&(purge_sys->mutex))); - - trx_purge_arr_get_biggest(purge_sys->arr, &limit_trx_no, - &limit_undo_no); + trx_purge_arr_get_biggest( + purge_sys->arr, &limit_trx_no, &limit_undo_no); if (limit_trx_no == 0) { @@ -630,34 +637,29 @@ trx_purge_truncate_history(void) ut_ad(limit_trx_no <= purge_sys->view->low_limit_no); - rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + for (rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + rseg != NULL; + rseg = UT_LIST_GET_NEXT(rseg_list, rseg)) { - while (rseg) { - trx_purge_truncate_rseg_history(rseg, limit_trx_no, - limit_undo_no); - rseg = UT_LIST_GET_NEXT(rseg_list, rseg); + trx_purge_truncate_rseg_history( + rseg, limit_trx_no, limit_undo_no); } } /********************************************************************//** Does a truncate if the purge array is empty. NOTE that when this function is -called, the caller must not have any latches on undo log pages! -@return TRUE if array empty */ +called, the caller must not have any latches on undo log pages! 
*/ UNIV_INLINE -ibool +void trx_purge_truncate_if_arr_empty(void) /*=================================*/ { - ut_ad(mutex_own(&(purge_sys->mutex))); + static ulint count; - if (purge_sys->arr->n_used == 0) { + if (!(++count % TRX_SYS_N_RSEGS) && purge_sys->arr->n_used == 0) { trx_purge_truncate_history(); - - return(TRUE); } - - return(FALSE); } /***********************************************************************//** @@ -675,8 +677,8 @@ trx_purge_rseg_get_next_history_log( trx_id_t trx_no; ibool del_marks; mtr_t mtr; - - ut_ad(mutex_own(&(purge_sys->mutex))); + rseg_queue_t rseg_queue; + const void* ptr; mutex_enter(&(rseg->mutex)); @@ -688,8 +690,9 @@ trx_purge_rseg_get_next_history_log( mtr_start(&mtr); - undo_page = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size, - rseg->last_page_no, &mtr); + undo_page = trx_undo_page_get_s_latched( + rseg->space, rseg->zip_size, rseg->last_page_no, &mtr); + log_hdr = undo_page + rseg->last_offset; /* Increase the purge page count by one for every handled log */ @@ -698,6 +701,7 @@ trx_purge_rseg_get_next_history_log( prev_log_addr = trx_purge_get_log_from_hist( flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr)); + if (prev_log_addr.page == FIL_NULL) { /* No logs left in the history list */ @@ -712,11 +716,11 @@ trx_purge_rseg_get_next_history_log( on the MySQL mailing list on Nov 9, 2004. The fut0lst.c file-based list was corrupt. The prev node pointer was FIL_NULL, even though the list length was over 8 million nodes! - We assume that purge truncates the history list in moderate + We assume that purge truncates the history list in large size pieces, and if we here reach the head of the list, the - list cannot be longer than 20 000 undo logs now. */ + list cannot be longer than 2000 000 undo logs now. */ - if (trx_sys->rseg_history_len > 20000) { + if (trx_sys->rseg_history_len > 2000000) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Warning: purge reached the" @@ -756,105 +760,150 @@ trx_purge_rseg_get_next_history_log( rseg->last_trx_no = trx_no; rseg->last_del_marks = del_marks; + rseg_queue.rseg = rseg; + rseg_queue.trx_no = rseg->last_trx_no; + + /* Purge can also produce events, however these are already ordered + in the rollback segment and any user generated event will be greater + than the events that Purge produces. ie. Purge can never produce + events from an empty rollback segment. */ + + mutex_enter(&purge_sys->bh_mutex); + + ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue); + ut_a(ptr != NULL); + + mutex_exit(&purge_sys->bh_mutex); + mutex_exit(&(rseg->mutex)); } /***********************************************************************//** -Chooses the next undo log to purge and updates the info in purge_sys. This -function is used to initialize purge_sys when the next record to purge is -not known, and also to update the purge system info on the next record when -purge has handled the whole undo log for a transaction. */ +Chooses the rollback segment with the smallest trx_id. +@return zip_size if log is for a compressed table, ULINT_UNDEFINED if + no rollback segments to purge, 0 for non compressed tables. */ static -void -trx_purge_choose_next_log(void) -/*===========================*/ +ulint +trx_purge_get_rseg_with_min_trx_id( +/*===============================*/ + trx_purge_t* purge_sys) /*!< in/out: purge instance */ + { - trx_undo_rec_t* rec; - trx_rseg_t* rseg; - trx_rseg_t* min_rseg; - trx_id_t min_trx_no; - ulint space = 0; /* remove warning (??? bug ???) 
*/ ulint zip_size = 0; - ulint page_no = 0; /* remove warning (??? bug ???) */ - ulint offset = 0; /* remove warning (??? bug ???) */ - mtr_t mtr; - ut_ad(mutex_own(&(purge_sys->mutex))); - ut_ad(purge_sys->next_stored == FALSE); + mutex_enter(&purge_sys->bh_mutex); - rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + /* Only purge consumes events from the binary heap, user + threads only produce the events. */ - min_trx_no = IB_ULONGLONG_MAX; + if (!ib_bh_is_empty(purge_sys->ib_bh)) { + trx_rseg_t* rseg; - min_rseg = NULL; + rseg = ((rseg_queue_t*) ib_bh_first(purge_sys->ib_bh))->rseg; + ib_bh_pop(purge_sys->ib_bh); - while (rseg) { - mutex_enter(&(rseg->mutex)); + mutex_exit(&purge_sys->bh_mutex); - if (rseg->last_page_no != FIL_NULL) { + purge_sys->rseg = rseg; + } else { + mutex_exit(&purge_sys->bh_mutex); - if (min_rseg == NULL - || min_trx_no > rseg->last_trx_no) { + purge_sys->rseg = NULL; - min_rseg = rseg; - min_trx_no = rseg->last_trx_no; - space = rseg->space; - zip_size = rseg->zip_size; - ut_a(space == 0); /* We assume in purge of - externally stored fields - that space id == 0 */ - page_no = rseg->last_page_no; - offset = rseg->last_offset; - } - } + return(ULINT_UNDEFINED); + } - mutex_exit(&(rseg->mutex)); + ut_a(purge_sys->rseg != NULL); - rseg = UT_LIST_GET_NEXT(rseg_list, rseg); - } + mutex_enter(&purge_sys->rseg->mutex); - if (min_rseg == NULL) { + ut_a(purge_sys->rseg->last_page_no != FIL_NULL); - return; - } + /* We assume in purge of externally stored fields + that space id == 0 */ + ut_a(purge_sys->rseg->space == 0); - mtr_start(&mtr); + zip_size = purge_sys->rseg->zip_size; - if (!min_rseg->last_del_marks) { - /* No need to purge this log */ + ut_a(purge_sys->purge_trx_no <= purge_sys->rseg->last_trx_no); - rec = &trx_purge_dummy_rec; - } else { - rec = trx_undo_get_first_rec(space, zip_size, page_no, offset, - RW_S_LATCH, &mtr); - if (rec == NULL) { - /* Undo log empty */ + purge_sys->purge_trx_no = purge_sys->rseg->last_trx_no; + + purge_sys->hdr_offset = purge_sys->rseg->last_offset; + + purge_sys->hdr_page_no = purge_sys->rseg->last_page_no; + + mutex_exit(&purge_sys->rseg->mutex); + + return(zip_size); +} + +/***********************************************************************//** +Position the purge sys "iterator" on the undo record to use for purging. */ +static +void +trx_purge_read_undo_rec( +/*====================*/ + trx_purge_t* purge_sys, /*!< in/out: purge instance */ + ulint zip_size) /*!< in: block size or 0 */ +{ + ulint page_no; + ulint offset = 0; + ib_uint64_t undo_no = 0; + + purge_sys->hdr_offset = purge_sys->rseg->last_offset; + page_no = purge_sys->hdr_page_no = purge_sys->rseg->last_page_no; + + if (purge_sys->rseg->last_del_marks) { + mtr_t mtr; + trx_undo_rec_t* undo_rec; - rec = &trx_purge_dummy_rec; + mtr_start(&mtr); + + undo_rec = trx_undo_get_first_rec( + 0 /* System space id */, zip_size, + purge_sys->hdr_page_no, + purge_sys->hdr_offset, RW_S_LATCH, &mtr); + + if (undo_rec != NULL) { + offset = page_offset(undo_rec); + undo_no = trx_undo_rec_get_undo_no(undo_rec); + page_no = page_get_page_no(page_align(undo_rec)); } + + mtr_commit(&mtr); } + purge_sys->offset = offset; + purge_sys->page_no = page_no; + purge_sys->purge_undo_no = undo_no; + purge_sys->next_stored = TRUE; - purge_sys->rseg = min_rseg; +} - purge_sys->hdr_page_no = page_no; - purge_sys->hdr_offset = offset; +/***********************************************************************//** +Chooses the next undo log to purge and updates the info in purge_sys. 
This +function is used to initialize purge_sys when the next record to purge is +not known, and also to update the purge system info on the next record when +purge has handled the whole undo log for a transaction. */ +static +void +trx_purge_choose_next_log(void) +/*===========================*/ +{ + ulint zip_size; - purge_sys->purge_trx_no = min_trx_no; + ut_ad(purge_sys->next_stored == FALSE); - if (rec == &trx_purge_dummy_rec) { + zip_size = trx_purge_get_rseg_with_min_trx_id(purge_sys); - purge_sys->purge_undo_no = 0; - purge_sys->page_no = page_no; - purge_sys->offset = 0; - } else { - purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec); + if (purge_sys->rseg != NULL) { - purge_sys->page_no = page_get_page_no(page_align(rec)); - purge_sys->offset = page_offset(rec); + trx_purge_read_undo_rec(purge_sys, zip_size); + } else { + /* There is nothing to do yet. */ + os_thread_yield(); } - - mtr_commit(&mtr); } /***********************************************************************//** @@ -880,7 +929,6 @@ trx_purge_get_next_rec( ulint cmpl_info; mtr_t mtr; - ut_ad(mutex_own(&(purge_sys->mutex))); ut_ad(purge_sys->next_stored); space = purge_sys->rseg->space; @@ -903,8 +951,8 @@ trx_purge_get_next_rec( mtr_start(&mtr); - undo_page = trx_undo_page_get_s_latched(space, zip_size, - page_no, &mtr); + undo_page = trx_undo_page_get_s_latched(space, zip_size, page_no, &mtr); + rec = undo_page + offset; rec2 = rec; @@ -913,9 +961,9 @@ trx_purge_get_next_rec( /* Try first to find the next record which requires a purge operation from the same page of the same undo log */ - next_rec = trx_undo_page_get_next_rec(rec2, - purge_sys->hdr_page_no, - purge_sys->hdr_offset); + next_rec = trx_undo_page_get_next_rec( + rec2, purge_sys->hdr_page_no, purge_sys->hdr_offset); + if (next_rec == NULL) { rec2 = trx_undo_get_next_rec( rec2, purge_sys->hdr_page_no, @@ -995,17 +1043,12 @@ trx_purge_fetch_next_rec( { trx_undo_rec_t* undo_rec; - mutex_enter(&(purge_sys->mutex)); if (purge_sys->state == TRX_STOP_PURGE) { trx_purge_truncate_if_arr_empty(); - mutex_exit(&(purge_sys->mutex)); - return(NULL); - } - - if (!purge_sys->next_stored) { + } else if (!purge_sys->next_stored) { trx_purge_choose_next_log(); if (!purge_sys->next_stored) { @@ -1020,8 +1063,6 @@ trx_purge_fetch_next_rec( (ulong) purge_sys->n_pages_handled); } - mutex_exit(&(purge_sys->mutex)); - return(NULL); } } @@ -1032,18 +1073,12 @@ trx_purge_fetch_next_rec( trx_purge_truncate_if_arr_empty(); - mutex_exit(&(purge_sys->mutex)); - return(NULL); - } - - if (purge_sys->purge_trx_no >= purge_sys->view->low_limit_no) { + } else if (purge_sys->purge_trx_no >= purge_sys->view->low_limit_no) { purge_sys->state = TRX_STOP_PURGE; trx_purge_truncate_if_arr_empty(); - mutex_exit(&(purge_sys->mutex)); - return(NULL); } @@ -1052,12 +1087,13 @@ trx_purge_fetch_next_rec( (ullint) purge_sys->purge_trx_no, (ullint) purge_sys->purge_undo_no); */ - *roll_ptr = trx_undo_build_roll_ptr(FALSE, (purge_sys->rseg)->id, - purge_sys->page_no, - purge_sys->offset); - *cell = trx_purge_arr_store_info(purge_sys->purge_trx_no, - purge_sys->purge_undo_no); + *roll_ptr = trx_undo_build_roll_ptr( + FALSE, (purge_sys->rseg)->id, purge_sys->page_no, + purge_sys->offset); + + *cell = trx_purge_arr_store_info( + purge_sys->purge_trx_no, purge_sys->purge_undo_no); ut_ad(purge_sys->purge_trx_no < purge_sys->view->low_limit_no); @@ -1066,8 +1102,6 @@ trx_purge_fetch_next_rec( undo_rec = trx_purge_get_next_rec(heap); - mutex_exit(&(purge_sys->mutex)); - return(undo_rec); } @@ -1079,11 
+1113,7 @@ trx_purge_rec_release( /*==================*/ trx_undo_inf_t* cell) /*!< in: storage cell */ { - mutex_enter(&(purge_sys->mutex)); - trx_purge_arr_remove_info(cell); - - mutex_exit(&(purge_sys->mutex)); } /*******************************************************************//** @@ -1097,23 +1127,11 @@ trx_purge( purge in one batch */ { que_thr_t* thr; - /* que_thr_t* thr2; */ ulint old_pages_handled; - mutex_enter(&(purge_sys->mutex)); - - if (purge_sys->trx->n_active_thrs > 0) { + ut_a(purge_sys->trx->n_active_thrs == 0); - mutex_exit(&(purge_sys->mutex)); - - /* Should not happen */ - - ut_error; - - return(0); - } - - rw_lock_x_lock(&(purge_sys->latch)); + rw_lock_x_lock(&purge_sys->latch); mutex_enter(&kernel_mutex); @@ -1147,8 +1165,9 @@ trx_purge( } } - purge_sys->view = read_view_oldest_copy_or_open_new(0, - purge_sys->heap); + purge_sys->view = read_view_oldest_copy_or_open_new( + 0, purge_sys->heap); + mutex_exit(&kernel_mutex); rw_lock_x_unlock(&(purge_sys->latch)); @@ -1159,7 +1178,6 @@ trx_purge( old_pages_handled = purge_sys->n_pages_handled; - mutex_exit(&(purge_sys->mutex)); mutex_enter(&kernel_mutex); @@ -1167,15 +1185,8 @@ trx_purge( ut_ad(thr); - /* thr2 = que_fork_start_command(purge_sys->query); - - ut_ad(thr2); */ - - mutex_exit(&kernel_mutex); - /* srv_que_task_enqueue(thr2); */ - if (srv_print_thread_releases) { fputs("Starting purge\n", stderr); diff --git a/storage/innobase/trx/trx0rseg.c b/storage/innobase/trx/trx0rseg.c index 740320f68c1..85beac8afbc 100644 --- a/storage/innobase/trx/trx0rseg.c +++ b/storage/innobase/trx/trx0rseg.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle Corpn. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -50,11 +50,11 @@ trx_rseg_get_on_id( { trx_rseg_t* rseg; - rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + ut_a(id < TRX_SYS_N_RSEGS); - while (rseg && rseg->id != id) { - rseg = UT_LIST_GET_NEXT(rseg_list, rseg); - } + rseg = trx_sys->rseg_array[id]; + + ut_a(rseg == NULL || id == rseg->id); return(rseg); } @@ -181,12 +181,15 @@ static trx_rseg_t* trx_rseg_mem_create( /*================*/ - ulint id, /*!< in: rollback segment id */ - ulint space, /*!< in: space where the segment placed */ - ulint zip_size, /*!< in: compressed page size in bytes - or 0 for uncompressed pages */ - ulint page_no, /*!< in: page number of the segment header */ - mtr_t* mtr) /*!< in: mtr */ + ulint id, /*!< in: rollback segment id */ + ulint space, /*!< in: space where the segment + placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number of the segment + header */ + ib_bh_t* ib_bh, /*!< in/out: rseg queue */ + mtr_t* mtr) /*!< in: mtr */ { ulint len; trx_rseg_t* rseg; @@ -225,6 +228,9 @@ trx_rseg_mem_create( len = flst_get_len(rseg_header + TRX_RSEG_HISTORY, mtr); if (len > 0) { + const void* ptr; + rseg_queue_t rseg_queue; + trx_sys->rseg_history_len += len; node_addr = trx_purge_get_log_from_hist( @@ -240,6 +246,17 @@ trx_rseg_mem_create( undo_log_hdr + TRX_UNDO_TRX_NO); rseg->last_del_marks = mtr_read_ulint( undo_log_hdr + TRX_UNDO_DEL_MARKS, MLOG_2BYTES, mtr); + + rseg_queue.rseg = rseg; + rseg_queue.trx_no = rseg->last_trx_no; + + if (rseg->last_page_no != FIL_NULL) { + /* There is no need to cover this operation by the purge + mutex because we are still bootstrapping. */ + + ptr = ib_bh_push(ib_bh, &rseg_queue); + ut_a(ptr != NULL); + } } else { rseg->last_page_no = FIL_NULL; } @@ -255,6 +272,7 @@ void trx_rseg_create_instance( /*=====================*/ trx_sysf_t* sys_header, /*!< in: trx system header */ + ib_bh_t* ib_bh, /*!< in/out: rseg queue */ mtr_t* mtr) /*!< in: mtr */ { ulint i; @@ -278,7 +296,7 @@ trx_rseg_create_instance( zip_size = space ? fil_space_get_zip_size(space) : 0; rseg = trx_rseg_mem_create( - i, space, zip_size, page_no, mtr); + i, space, zip_size, page_no, ib_bh, mtr); ut_a(rseg->id == i); } @@ -327,7 +345,8 @@ trx_rseg_create(void) zip_size = space ? fil_space_get_zip_size(space) : 0; rseg = trx_rseg_mem_create( - slot_no, space, zip_size, page_no, &mtr); + slot_no, space, zip_size, page_no, + purge_sys->ib_bh, &mtr); } mutex_exit(&kernel_mutex); @@ -342,13 +361,14 @@ UNIV_INTERN void trx_rseg_list_and_array_init( /*=========================*/ - trx_sysf_t* sys_header, /* in: trx system header */ - mtr_t* mtr) /* in: mtr */ + trx_sysf_t* sys_header, /*!< in: trx system header */ + ib_bh_t* ib_bh, /*!< in: rseg queue */ + mtr_t* mtr) /*!< in: mtr */ { UT_LIST_INIT(trx_sys->rseg_list); trx_sys->rseg_history_len = 0; - trx_rseg_create_instance(sys_header, mtr); + trx_rseg_create_instance(sys_header, ib_bh, mtr); } diff --git a/storage/innobase/trx/trx0sys.c b/storage/innobase/trx/trx0sys.c index 101f225a06f..7af9fbb1af8 100644 --- a/storage/innobase/trx/trx0sys.c +++ b/storage/innobase/trx/trx0sys.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -946,6 +946,31 @@ trx_sysf_create( } /*****************************************************************//** +Compare two trx_rseg_t instances on last_trx_no. */ +static +int +trx_rseg_compare_last_trx_no( +/*=========================*/ + const void* p1, /*!< in: elem to compare */ + const void* p2) /*!< in: elem to compare */ +{ + ib_int64_t cmp; + + const rseg_queue_t* rseg_q1 = (const rseg_queue_t*) p1; + const rseg_queue_t* rseg_q2 = (const rseg_queue_t*) p2; + + cmp = rseg_q1->trx_no - rseg_q2->trx_no; + + if (cmp < 0) { + return(-1); + } else if (cmp > 0) { + return(1); + } + + return(0); +} + +/*****************************************************************//** Creates and initializes the central memory structures for the transaction system. This is called when the database is started. */ UNIV_INTERN @@ -958,6 +983,7 @@ trx_sys_init_at_db_start(void) const char* unit = ""; trx_t* trx; mtr_t mtr; + ib_bh_t* ib_bh; mtr_start(&mtr); @@ -965,11 +991,19 @@ trx_sys_init_at_db_start(void) mutex_enter(&kernel_mutex); - trx_sys = mem_alloc(sizeof(trx_sys_t)); + /* We create the min binary heap here and pass ownership to + purge when we init the purge sub-system. Purge is responsible + for freeing the binary heap. */ + + ib_bh = ib_bh_create( + trx_rseg_compare_last_trx_no, + sizeof(rseg_queue_t), TRX_SYS_N_RSEGS); + + trx_sys = mem_zalloc(sizeof(*trx_sys)); sys_header = trx_sysf_get(&mtr); - trx_rseg_list_and_array_init(sys_header, &mtr); + trx_rseg_list_and_array_init(sys_header, ib_bh, &mtr); trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); @@ -1023,7 +1057,8 @@ trx_sys_init_at_db_start(void) UT_LIST_INIT(trx_sys->view_list); - trx_purge_sys_create(); + /* Transfer ownership to purge. */ + trx_purge_sys_create(ib_bh); mutex_exit(&kernel_mutex); diff --git a/storage/innobase/trx/trx0trx.c b/storage/innobase/trx/trx0trx.c index 4bbcee210b0..820c40fe857 100644 --- a/storage/innobase/trx/trx0trx.c +++ b/storage/innobase/trx/trx0trx.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -42,6 +42,7 @@ Created 3/26/1996 Heikki Tuuri #include "btr0sea.h" #include "os0proc.h" #include "trx0xa.h" +#include "trx0purge.h" #include "ha_prototypes.h" /** Dummy session used currently in MySQL interface */ @@ -604,36 +605,26 @@ trx_lists_init_at_db_start(void) /******************************************************************//** Assigns a rollback segment to a transaction in a round-robin fashion. -Skips the SYSTEM rollback segment if another is available. 
-@return assigned rollback segment id */ +@return assigned rollback segment instance */ UNIV_INLINE -ulint -trx_assign_rseg(void) -/*=================*/ +trx_rseg_t* +trx_assign_rseg( +/*============*/ + ulint max_undo_logs) /*!< in: maximum number of UNDO logs to use */ { - trx_rseg_t* rseg = trx_sys->latest_rseg; + trx_rseg_t* rseg = trx_sys->latest_rseg; ut_ad(mutex_own(&kernel_mutex)); -loop: - /* Get next rseg in a round-robin fashion */ rseg = UT_LIST_GET_NEXT(rseg_list, rseg); - if (rseg == NULL) { + if (rseg == NULL || rseg->id == max_undo_logs - 1) { rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); } - /* If it is the SYSTEM rollback segment, and there exist others, skip - it */ - - if ((rseg->id == TRX_SYS_SYSTEM_RSEG_ID) - && (UT_LIST_GET_LEN(trx_sys->rseg_list) > 1)) { - goto loop; - } - trx_sys->latest_rseg = rseg; - return(rseg->id); + return(rseg); } /****************************************************************//** @@ -663,12 +654,9 @@ trx_start_low( ut_ad(trx->conc_state != TRX_ACTIVE); - if (rseg_id == ULINT_UNDEFINED) { - - rseg_id = trx_assign_rseg(); - } + ut_a(rseg_id == ULINT_UNDEFINED); - rseg = trx_sys_get_nth_rseg(trx_sys, rseg_id); + rseg = trx_assign_rseg(srv_rollback_segments); trx->id = trx_sys_get_new_trx_id(); @@ -719,107 +707,179 @@ trx_start( } /****************************************************************//** -Commits a transaction. */ -UNIV_INTERN +Set the transaction serialisation number. */ +static void -trx_commit_off_kernel( -/*==================*/ - trx_t* trx) /*!< in: transaction */ +trx_serialisation_number_get( +/*=========================*/ + trx_t* trx) /*!< in: transaction */ { - page_t* update_hdr_page; - ib_uint64_t lsn = 0; trx_rseg_t* rseg; - trx_undo_t* undo; - mtr_t mtr; - ut_ad(mutex_own(&kernel_mutex)); + rseg = trx->rseg; - trx->must_flush_log_later = FALSE; + ut_ad(mutex_own(&rseg->mutex)); - rseg = trx->rseg; + mutex_enter(&kernel_mutex); - if (trx->insert_undo != NULL || trx->update_undo != NULL) { + trx->no = trx_sys_get_new_trx_id(); + + /* If the rollack segment is not empty then the + new trx_t::no can't be less than any trx_t::no + already in the rollback segment. User threads only + produce events when a rollback segment is empty. */ + + if (rseg->last_page_no == FIL_NULL) { + void* ptr; + rseg_queue_t rseg_queue; + + rseg_queue.rseg = rseg; + rseg_queue.trx_no = trx->no; + + mutex_enter(&purge_sys->bh_mutex); + + /* This is to reduce the pressure on the kernel mutex, + though in reality it should make very little (read no) + difference because this code path is only taken when the + rbs is empty. */ mutex_exit(&kernel_mutex); - mtr_start(&mtr); + ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue); + ut_a(ptr); - /* Change the undo log segment states from TRX_UNDO_ACTIVE - to some other state: these modifications to the file data - structure define the transaction as committed in the file - based world, at the serialization point of the log sequence - number lsn obtained below. */ + mutex_exit(&purge_sys->bh_mutex); + } else { + mutex_exit(&kernel_mutex); + } +} - mutex_enter(&(rseg->mutex)); +/****************************************************************//** +Assign the transaction its history serialisation number and write the +update UNDO log record to the assigned rollback segment. +@return the LSN of the UNDO log write. 
*/ +static +ib_uint64_t +trx_write_serialisation_history( +/*============================*/ + trx_t* trx) /*!< in: transaction */ +{ + mtr_t mtr; + trx_rseg_t* rseg; - if (trx->insert_undo != NULL) { - trx_undo_set_state_at_finish(trx->insert_undo, &mtr); - } + ut_ad(!mutex_own(&kernel_mutex)); - undo = trx->update_undo; + rseg = trx->rseg; - if (undo) { - mutex_enter(&kernel_mutex); - trx->no = trx_sys_get_new_trx_no(); - mutex_exit(&kernel_mutex); + mtr_start(&mtr); - /* It is not necessary to obtain trx->undo_mutex here - because only a single OS thread is allowed to do the - transaction commit for this transaction. */ + /* Change the undo log segment states from TRX_UNDO_ACTIVE + to some other state: these modifications to the file data + structure define the transaction as committed in the file + based domain, at the serialization point of the log sequence + number lsn obtained below. */ - update_hdr_page = trx_undo_set_state_at_finish( - undo, &mtr); + if (trx->update_undo != NULL) { + page_t* undo_hdr_page; + trx_undo_t* undo = trx->update_undo; - /* We have to do the cleanup for the update log while - holding the rseg mutex because update log headers - have to be put to the history list in the order of - the trx number. */ + /* We have to hold the rseg mutex because update + log headers have to be put to the history list in the + (serialisation) order of the UNDO trx number. This is + required for the purge in-memory data structures too. */ - trx_undo_update_cleanup(trx, update_hdr_page, &mtr); - } + mutex_enter(&rseg->mutex); - mutex_exit(&(rseg->mutex)); + /* Assign the transaction serialisation number and also + update the purge min binary heap if this is the first + UNDO log being written to the assigned rollback segment. */ - /* Update the latest MySQL binlog name and offset info - in trx sys header if MySQL binlogging is on or the database - server is a MySQL replication slave */ - - if (trx->mysql_log_file_name - && trx->mysql_log_file_name[0] != '\0') { - trx_sys_update_mysql_binlog_offset( - trx->mysql_log_file_name, - trx->mysql_log_offset, - TRX_SYS_MYSQL_LOG_INFO, &mtr); - trx->mysql_log_file_name = NULL; - } + trx_serialisation_number_get(trx); - /* The following call commits the mini-transaction, making the - whole transaction committed in the file-based world, at this - log sequence number. The transaction becomes 'durable' when - we write the log to disk, but in the logical sense the commit - in the file-based data structures (undo logs etc.) happens - here. - - NOTE that transaction numbers, which are assigned only to - transactions with an update undo log, do not necessarily come - in exactly the same order as commit lsn's, if the transactions - have different rollback segments. To get exactly the same - order we should hold the kernel mutex up to this point, - adding to the contention of the kernel mutex. However, if - a transaction T2 is able to see modifications made by - a transaction T1, T2 will always get a bigger transaction - number and a bigger commit lsn than T1. */ + /* It is not necessary to obtain trx->undo_mutex here + because only a single OS thread is allowed to do the + transaction commit for this transaction. 
*/ - /*--------------*/ - mtr_commit(&mtr); - /*--------------*/ - lsn = mtr.end_lsn; + undo_hdr_page = trx_undo_set_state_at_finish(undo, &mtr); + + trx_undo_update_cleanup(trx, undo_hdr_page, &mtr); + } else { + mutex_enter(&rseg->mutex); + } + + if (trx->insert_undo != NULL) { + trx_undo_set_state_at_finish(trx->insert_undo, &mtr); + } + + mutex_exit(&rseg->mutex); + + /* Update the latest MySQL binlog name and offset info + in trx sys header if MySQL binlogging is on or the database + server is a MySQL replication slave */ + + if (trx->mysql_log_file_name + && trx->mysql_log_file_name[0] != '\0') { + + trx_sys_update_mysql_binlog_offset( + trx->mysql_log_file_name, + trx->mysql_log_offset, + TRX_SYS_MYSQL_LOG_INFO, &mtr); + + trx->mysql_log_file_name = NULL; + } + + /* The following call commits the mini-transaction, making the + whole transaction committed in the file-based world, at this + log sequence number. The transaction becomes 'durable' when + we write the log to disk, but in the logical sense the commit + in the file-based data structures (undo logs etc.) happens + here. + + NOTE that transaction numbers, which are assigned only to + transactions with an update undo log, do not necessarily come + in exactly the same order as commit lsn's, if the transactions + have different rollback segments. To get exactly the same + order we should hold the kernel mutex up to this point, + adding to the contention of the kernel mutex. However, if + a transaction T2 is able to see modifications made by + a transaction T1, T2 will always get a bigger transaction + number and a bigger commit lsn than T1. */ + + /*--------------*/ + mtr_commit(&mtr); + /*--------------*/ + + return(mtr.end_lsn); +} + +/****************************************************************//** +Commits a transaction. */ +UNIV_INTERN +void +trx_commit_off_kernel( +/*==================*/ + trx_t* trx) /*!< in: transaction */ +{ + ib_uint64_t lsn; + + ut_ad(mutex_own(&kernel_mutex)); + + trx->must_flush_log_later = FALSE; + + /* If the transaction made any updates then we need to write the + UNDO logs for the updates to the assigned rollback segment. */ + + if (trx->insert_undo != NULL || trx->update_undo != NULL) { + mutex_exit(&kernel_mutex); + + lsn = trx_write_serialisation_history(trx); mutex_enter(&kernel_mutex); + } else { + lsn = 0; } - ut_ad(trx->conc_state == TRX_ACTIVE - || trx->conc_state == TRX_PREPARED); + ut_ad(trx->conc_state == TRX_ACTIVE || trx->conc_state == TRX_PREPARED); ut_ad(mutex_own(&kernel_mutex)); /* The following assignment makes the transaction committed in memory diff --git a/storage/innobase/ut/ut0bh.c b/storage/innobase/ut/ut0bh.c new file mode 100644 index 00000000000..ae0b1aff207 --- /dev/null +++ b/storage/innobase/ut/ut0bh.c @@ -0,0 +1,164 @@ +/***************************************************************************//** +Copyright (c) 2010, 2011, Oracle Corpn. All Rights Reserved. + +Portions of this file contain modifications contributed and copyrighted by +Sun Microsystems, Inc. Those modifications are gratefully acknowledged and +are described briefly in the InnoDB documentation. The contributions by +Sun Microsystems are incorporated with their permission, and subject to the +conditions contained in the file COPYING.Sun_Microsystems. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/******************************************************************//** +@file ut/ut0bh.c +Binary min-heap implementation. + +Created 2010-05-28 by Sunny Bains +*******************************************************/ + +#include "ut0bh.h" +#include "ut0mem.h" + +#ifdef UNIV_NONINL +#include "ut0bh.ic" +#endif + +#include <string.h> + +/**********************************************************************//** +Create a binary heap. +@return a new binary heap */ +UNIV_INTERN +ib_bh_t* +ib_bh_create( +/*=========*/ + ib_bh_cmp_t compare, /*!< in: comparator */ + ulint sizeof_elem, /*!< in: size of one element */ + ulint max_elems) /*!< in: max elements allowed */ +{ + ulint sz; + ib_bh_t* ib_bh; + + sz = sizeof(*ib_bh) + (sizeof_elem * max_elems); + + ib_bh = (ib_bh_t*) ut_malloc(sz); + memset(ib_bh, 0x0, sz); + + ib_bh->compare = compare; + ib_bh->max_elems = max_elems; + ib_bh->sizeof_elem = sizeof_elem; + + return(ib_bh); +} + +/**********************************************************************//** +Free a binary heap. +@return a new binary heap */ +UNIV_INTERN +void +ib_bh_free( +/*=======*/ + ib_bh_t* ib_bh) /*!< in/own: instance */ +{ + ut_free(ib_bh); +} + +/**********************************************************************//** +Add an element to the binary heap. Note: The element is copied. +@return pointer to added element or NULL if full. */ +UNIV_INTERN +void* +ib_bh_push( +/*=======*/ + ib_bh_t* ib_bh, /*!< in/out: instance */ + const void* elem) /*!< in: element to add */ +{ + void* ptr; + + if (ib_bh_is_full(ib_bh)) { + return(NULL); + } else if (ib_bh_is_empty(ib_bh)) { + ++ib_bh->n_elems; + return(ib_bh_set(ib_bh, 0, elem)); + } else { + ulint i; + + i = ib_bh->n_elems; + + ++ib_bh->n_elems; + + for (ptr = ib_bh_get(ib_bh, i >> 1); + i > 0 && ib_bh->compare(ptr, elem) > 0; + i >>= 1, ptr = ib_bh_get(ib_bh, i >> 1)) { + + ib_bh_set(ib_bh, i, ptr); + } + + ptr = ib_bh_set(ib_bh, i, elem); + } + + return(ptr); +} + +/**********************************************************************//** +Remove the first element from the binary heap. 
*/ +UNIV_INTERN +void +ib_bh_pop( +/*======*/ + ib_bh_t* ib_bh) /*!< in/out: instance */ +{ + byte* ptr; + byte* last; + ulint parent = 0; + + if (ib_bh_is_empty(ib_bh)) { + return; + } else if (ib_bh_size(ib_bh) == 1) { + --ib_bh->n_elems; + return; + } + + last = (byte*) ib_bh_last(ib_bh); + + /* Start from the child node */ + ptr = (byte*) ib_bh_get(ib_bh, 1); + + while (ptr < last) { + /* If the "right" child node is < "left" child node */ + if (ib_bh->compare(ptr + ib_bh->sizeof_elem, ptr) < 0) { + ptr += ib_bh->sizeof_elem; + } + + if (ib_bh->compare(last, ptr) <= 0) { + break; + } + + ib_bh_set(ib_bh, parent, ptr); + + parent = (ptr - (byte*) ib_bh_first(ib_bh)) + / ib_bh->sizeof_elem; + + if ((parent << 1) >= ib_bh_size(ib_bh)) { + break; + } + + ptr = (byte*) ib_bh_get(ib_bh, parent << 1); + } + + --ib_bh->n_elems; + + ib_bh_set(ib_bh, parent, last); +} diff --git a/storage/innobase/ut/ut0ut.c b/storage/innobase/ut/ut0ut.c index 4f4dfc5eed8..cd0894b132a 100644 --- a/storage/innobase/ut/ut0ut.c +++ b/storage/innobase/ut/ut0ut.c @@ -1,13 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved. -Copyright (c) 2009, Sun Microsystems, Inc. - -Portions of this file contain modifications contributed and copyrighted by -Sun Microsystems, Inc. Those modifications are gratefully acknowledged and -are described briefly in the InnoDB documentation. The contributions by -Sun Microsystems are incorporated with their permission, and subject to the -conditions contained in the file COPYING.Sun_Microsystems. +Copyright (c) 2011, Oracle Corpn. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software diff --git a/storage/myisam/mi_create.c b/storage/myisam/mi_create.c index 46c61eb4709..ea7382300cc 100644 --- a/storage/myisam/mi_create.c +++ b/storage/myisam/mi_create.c @@ -269,7 +269,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, keyseg->type != HA_KEYTYPE_VARBINARY2) { my_errno=HA_WRONG_CREATE_OPTION; - goto err; + goto err_no_lock; } } keydef->keysegs+=sp_segs; @@ -278,7 +278,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, min_key_length_skip+=SPLEN*2*SPDIMS; #else my_errno= HA_ERR_UNSUPPORTED; - goto err; + goto err_no_lock; #endif /*HAVE_SPATIAL*/ } else if (keydef->flag & HA_FULLTEXT) @@ -294,7 +294,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, keyseg->type != HA_KEYTYPE_VARTEXT2) { my_errno=HA_WRONG_CREATE_OPTION; - goto err; + goto err_no_lock; } if (!(keyseg->flag & HA_BLOB_PART) && (keyseg->type == HA_KEYTYPE_VARTEXT1 || @@ -419,7 +419,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, if (keydef->keysegs > MI_MAX_KEY_SEG) { my_errno=HA_WRONG_CREATE_OPTION; - goto err; + goto err_no_lock; } /* key_segs may be 0 in the case when we only want to be able to @@ -444,7 +444,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, length >= MI_MAX_KEY_BUFF) { my_errno=HA_WRONG_CREATE_OPTION; - goto err; + goto err_no_lock; } set_if_bigger(max_key_block_length,keydef->block_length); keydef->keylength= (uint16) key_length; @@ -491,7 +491,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, "indexes and/or unique constraints.", MYF(0), name + dirname_length(name)); my_errno= HA_WRONG_CREATE_OPTION; - goto err; + goto err_no_lock; } bmove(share.state.header.file_version,(uchar*) myisam_file_magic,4); 
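The new ut0bh.c above implements a fixed-capacity binary min-heap: ib_bh_push() copies the element into the next free slot and sifts it up past any larger parent, while ib_bh_pop() discards the root and walks the last element down, always towards the smaller child. The following standalone sketch shows the same two operations with textbook 0-based indexing and plain int elements. It is an illustration only, not the InnoDB code: the real implementation stores fixed-size elements inline after the ib_bh_t header and uses accessors (ib_bh_get(), ib_bh_set(), ib_bh_first(), ib_bh_last()) from ut0bh.ic, which is not part of this patch.

/* Standalone sketch of the push/pop logic in ut0bh.c (illustrative only). */

#include <stdio.h>

#define MAX_ELEMS	64

static int	heap[MAX_ELEMS];	/* element storage */
static int	n_elems;		/* current number of elements */

/* Append at the end, then sift the value up past any larger parent
(the same idea as the loop in ib_bh_push()). */
static int heap_push(int value)
{
	int	i;

	if (n_elems == MAX_ELEMS) {
		return -1;		/* full, like ib_bh_push() returning NULL */
	}

	i = n_elems++;

	while (i > 0 && heap[(i - 1) / 2] > value) {
		heap[i] = heap[(i - 1) / 2];	/* pull the parent down */
		i = (i - 1) / 2;
	}

	heap[i] = value;
	return 0;
}

/* Remove the minimum: take the last element out, then walk it down,
always towards the smaller child (the same idea as ib_bh_pop()). */
static void heap_pop(void)
{
	int	i = 0;
	int	last;

	if (n_elems == 0) {
		return;
	}

	last = heap[--n_elems];

	while (2 * i + 1 < n_elems) {
		int	child = 2 * i + 1;

		if (child + 1 < n_elems && heap[child + 1] < heap[child]) {
			child++;	/* the right child is the smaller one */
		}

		if (last <= heap[child]) {
			break;
		}

		heap[i] = heap[child];	/* pull the smaller child up */
		i = child;
	}

	heap[i] = last;
}

int main(void)
{
	int	vals[] = {5, 1, 4, 2, 3};
	int	i;

	for (i = 0; i < 5; i++) {
		heap_push(vals[i]);
	}

	while (n_elems > 0) {
		printf("%d\n", heap[0]);	/* prints 1 2 3 4 5 in order */
		heap_pop();
	}

	return 0;
}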
@@ -810,12 +810,14 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
   errpos=0;
   mysql_mutex_unlock(&THR_LOCK_myisam);
   if (mysql_file_close(file, MYF(0)))
-    goto err;
+    goto err_no_lock;
   my_free(rec_per_key_part);
   DBUG_RETURN(0);
 
 err:
   mysql_mutex_unlock(&THR_LOCK_myisam);
+
+err_no_lock:
   save_errno=my_errno;
   switch (errpos) {
   case 3:
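The mi_create() hunks replace goto err with goto err_no_lock on failure paths that run before the function has taken THR_LOCK_myisam, and split the cleanup so that err: releases the mutex and then falls through into err_no_lock:. Only paths that actually hold the lock jump to err, so an early validation failure no longer unlocks a mutex it never acquired. Below is a minimal standalone sketch of the same two-label cleanup pattern; the function and mutex names are placeholders, not MyISAM code.

/* Sketch of the err / err_no_lock cleanup pattern (hypothetical names). */

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;

static int create_thing(int arg)
{
	int	err = 0;

	if (arg < 0) {
		err = -1;
		goto err_no_lock;	/* lock not taken yet: skip the unlock */
	}

	pthread_mutex_lock(&registry_lock);

	if (arg > 100) {
		err = -2;
		goto err;		/* lock held: must release it */
	}

	/* ... do the real work under the lock ... */

	pthread_mutex_unlock(&registry_lock);
	return 0;

err:
	pthread_mutex_unlock(&registry_lock);

err_no_lock:
	fprintf(stderr, "create_thing failed: %d\n", err);
	return err;
}

int main(void)
{
	create_thing(-1);	/* fails before the lock is taken */
	create_thing(200);	/* fails while holding the lock */
	return 0;
}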