diff options
88 files changed, 4335 insertions, 2954 deletions
diff --git a/storage/xtradb/ChangeLog b/storage/xtradb/ChangeLog index 102db3d7824..f873f3a24bd 100644 --- a/storage/xtradb/ChangeLog +++ b/storage/xtradb/ChangeLog @@ -1,3 +1,105 @@ +2011-08-08 The InnoDB Team + + * row/row0sel.c: + Fix Bug#12835650 VARCHAR maximum length performance impact + +2011-08-08 The InnoDB Team + + * handler/ha_innodb.cc: + Fix Bug#12770537 I_S.TABLES.DATA_LENGTH DOES NOT SHOW ON-DISK SIZE + FOR COMPRESSED INNODB + +2011-07-19 The InnoDB Team + + * buf/buf0buf.c, buf/buf0rea.c, handler/ha_innodb.cc, + include/buf0buf.h, include/buf0buf.ic, include/srv0srv.h, + srv/srv0srv.c: + Fix Bug#12356373 by reintroducing random readahead + +2011-06-30 The InnoDB Team + + * row/row0row.c: + Fix Bug#12637786 Wrong secondary index entries on CHAR and VARCHAR + columns in ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED + +2011-06-16 The InnoDB Team + + * btr/btr0cur.c, buf/buf0buddy.c, buf/buf0buf.c, buf/buf0lru.c, + include/buf0buddy.h, include/buf0buddy.ic, include/buf0buf.h, + include/buf0buf.ic, include/buf0lru.h, include/buf0types.h: + Fix Bug#61188 DROP TABLE extremely slow + +2011-06-16 The InnoDB Team + + * buf/buf0buddy.c, buf/buf0buf.c, buf/buf0flu.c, buf/buf0lru.c, + include/buf0buf.h, include/buf0lru.h: + Fix Bug#61341 buf_LRU_insert_zip_clean can be O(N) on LRU length + +2011-06-16 The InnoDB Team + + * page/page0zip.c, rem/rem0rec.c: + Fix Bug#61191 question about page_zip_available() + +2011-06-16 The InnoDB Team + + * btr/btr0btr.c, btr/btr0cur.c, include/btr0btr.h, include/btr0cur.h, + include/btr0cur.ic, include/buf0buf.h, include/buf0buf.ic, + include/page0cur.ic, include/page0page.h, include/page0page.ic, + include/sync0rw.ic, include/sync0sync.h, page/page0cur.c, + page/page0page.c, row/row0ins.c, row/row0upd.c, + sync/sync0rw.c, sync/sync0sync.c: + Fix Bug#12612184 Race condition after btr_cur_pessimistic_update() + +2011-06-09 The InnoDB Team + * btr/btr0cur.c, include/rem0rec.h, include/rem0rec.ic, + * row/row0row.c, row/row0vers.c, 
trx/trx0rec.c: + Instrumentation for Bug#12612184 Race condition in row_upd_clust_rec() + +2011-05-19 The InnoDB Team + + * row/row0row.c: + Fix Bug#12429576 Assertion failure on purge of column prefix index + +2011-04-07 The InnoDB Team + + * handler/ha_innodb.cc, handler/ha_innodb.h, handler/handler0alter.cc: + Fix Bug #52409 Assertion failure: long semaphore wait + +2011-04-07 The InnoDB Team + + * handler/ha_innodb.cc, include/trx0trx.h, include/trx0undo.h, + log/log0log.c, trx/trx0sys.c, trx/trx0trx.c, trx/trx0undo.c: + Fix Bug #59641 Prepared XA transaction in system after hard crash + causes future shutdown hang + +2011-03-30 The InnoDB Team + + * srv/srv0srv.c, sync/sync0arr.h, sync/sync0arr.c: + Fix Bug#11877216 InnoDB too eager to commit suicide on a busy server + +2011-03-15 The InnoDB Team + + * btr/btr0cur.c, page/page0zip.c: + Fix Bug#11849231 inflateInit() invoked without initializing all memory + +2011-02-28 The InnoDB Team + + * btr/btr0sea.c, buf/buf0buf.c, buf/buf0lru.c: + Fix Bug#58549 Race condition in buf_LRU_drop_page_hash_for_tablespace() + and compressed tables + +2011-02-15 The InnoDB Team + + * sync/sync0rw.c, innodb_bug59307.test: + Bug#59307 Valgrind: uninitialized value in + rw_lock_set_writer_id_and_recursion_flag() + +2011-02-14 The InnoDB Team + + * handler/handler0alter.cc: + Bug#59749 Enabling concurrent reads while creating non-primary + unique index gives failures + 2011-01-31 The InnoDB Team * btr/btr0cur.c, include/row0upd.h, diff --git a/storage/xtradb/btr/btr0btr.c b/storage/xtradb/btr/btr0btr.c index 2fb14b06a7b..396ad422010 100644 --- a/storage/xtradb/btr/btr0btr.c +++ b/storage/xtradb/btr/btr0btr.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -42,6 +42,560 @@ Created 6/2/1994 Heikki Tuuri #include "ibuf0ibuf.h" #include "trx0trx.h" +#ifdef UNIV_BLOB_DEBUG +# include "srv0srv.h" +# include "ut0rbt.h" + +/** TRUE when messages about index->blobs modification are enabled. */ +static ibool btr_blob_dbg_msg; + +/** Issue a message about an operation on index->blobs. +@param op operation +@param b the entry being subjected to the operation +@param ctx the context of the operation */ +#define btr_blob_dbg_msg_issue(op, b, ctx) \ + fprintf(stderr, op " %u:%u:%u->%u %s(%u,%u,%u)\n", \ + (b)->ref_page_no, (b)->ref_heap_no, \ + (b)->ref_field_no, (b)->blob_page_no, ctx, \ + (b)->owner, (b)->always_owner, (b)->del) + +/** Insert to index->blobs a reference to an off-page column. +@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_insert( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ +{ + if (btr_blob_dbg_msg) { + btr_blob_dbg_msg_issue("insert", b, ctx); + } + mutex_enter(&index->blobs_mutex); + rbt_insert(index->blobs, b, b); + mutex_exit(&index->blobs_mutex); +} + +/** Remove from index->blobs a reference to an off-page column. 
+@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_delete( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ +{ + if (btr_blob_dbg_msg) { + btr_blob_dbg_msg_issue("delete", b, ctx); + } + mutex_enter(&index->blobs_mutex); + ut_a(rbt_delete(index->blobs, b)); + mutex_exit(&index->blobs_mutex); +} + +/**************************************************************//** +Comparator for items (btr_blob_dbg_t) in index->blobs. +The key in index->blobs is (ref_page_no, ref_heap_no, ref_field_no). +@return negative, 0 or positive if *a<*b, *a=*b, *a>*b */ +static +int +btr_blob_dbg_cmp( +/*=============*/ + const void* a, /*!< in: first btr_blob_dbg_t to compare */ + const void* b) /*!< in: second btr_blob_dbg_t to compare */ +{ + const btr_blob_dbg_t* aa = a; + const btr_blob_dbg_t* bb = b; + + ut_ad(aa != NULL); + ut_ad(bb != NULL); + + if (aa->ref_page_no != bb->ref_page_no) { + return(aa->ref_page_no < bb->ref_page_no ? -1 : 1); + } + if (aa->ref_heap_no != bb->ref_heap_no) { + return(aa->ref_heap_no < bb->ref_heap_no ? -1 : 1); + } + if (aa->ref_field_no != bb->ref_field_no) { + return(aa->ref_field_no < bb->ref_field_no ? -1 : 1); + } + return(0); +} + +/**************************************************************//** +Add a reference to an off-page column to the index->blobs map. 
*/ +UNIV_INTERN +void +btr_blob_dbg_add_blob( +/*==================*/ + const rec_t* rec, /*!< in: clustered index record */ + ulint field_no, /*!< in: off-page column number */ + ulint page_no, /*!< in: start page of the column */ + dict_index_t* index, /*!< in/out: index tree */ + const char* ctx) /*!< in: context (for logging) */ +{ + btr_blob_dbg_t b; + const page_t* page = page_align(rec); + + ut_a(index->blobs); + + b.blob_page_no = page_no; + b.ref_page_no = page_get_page_no(page); + b.ref_heap_no = page_rec_get_heap_no(rec); + b.ref_field_no = field_no; + ut_a(b.ref_field_no >= index->n_uniq); + b.always_owner = b.owner = TRUE; + b.del = FALSE; + ut_a(!rec_get_deleted_flag(rec, page_is_comp(page))); + btr_blob_dbg_rbt_insert(index, &b, ctx); +} + +/**************************************************************//** +Add to index->blobs any references to off-page columns from a record. +@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add_rec( +/*=================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ +{ + ulint count = 0; + ulint i; + btr_blob_dbg_t b; + ibool del; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (!rec_offs_any_extern(offsets)) { + return(0); + } + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + del = (rec_get_deleted_flag(rec, rec_offs_comp(offsets)) != 0); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* field_ref = rec_get_nth_field( + rec, offsets, i, &len); + + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + if (!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)) { + /* the column has not been stored yet */ + continue; + } + + b.ref_field_no = i; + 
b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + ut_a(b.ref_field_no >= index->n_uniq); + b.always_owner = b.owner + = !(field_ref[BTR_EXTERN_LEN] + & BTR_EXTERN_OWNER_FLAG); + b.del = del; + + btr_blob_dbg_rbt_insert(index, &b, ctx); + count++; + } + } + + return(count); +} + +/**************************************************************//** +Display the references to off-page columns. +This function is to be called from a debugger, +for example when a breakpoint on ut_dbg_assertion_failed is hit. */ +UNIV_INTERN +void +btr_blob_dbg_print( +/*===============*/ + const dict_index_t* index) /*!< in: index tree */ +{ + const ib_rbt_node_t* node; + + if (!index->blobs) { + return; + } + + /* We intentionally do not acquire index->blobs_mutex here. + This function is to be called from a debugger, and the caller + should make sure that the index->blobs_mutex is held. */ + + for (node = rbt_first(index->blobs); + node != NULL; node = rbt_next(index->blobs, node)) { + const btr_blob_dbg_t* b + = rbt_value(btr_blob_dbg_t, node); + fprintf(stderr, "%u:%u:%u->%u%s%s%s\n", + b->ref_page_no, b->ref_heap_no, b->ref_field_no, + b->blob_page_no, + b->owner ? "" : "(disowned)", + b->always_owner ? "" : "(has disowned)", + b->del ? "(deleted)" : ""); + } +} + +/**************************************************************//** +Remove from index->blobs any references to off-page columns from a record. 
+@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove_rec( +/*====================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ +{ + ulint i; + ulint count = 0; + btr_blob_dbg_t b; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (!rec_offs_any_extern(offsets)) { + return(0); + } + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* field_ref = rec_get_nth_field( + rec, offsets, i, &len); + + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + b.ref_field_no = i; + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + + switch (b.blob_page_no) { + case 0: + /* The column has not been stored yet. + The BLOB pointer must be all zero. + There cannot be a BLOB starting at + page 0, because page 0 is reserved for + the tablespace header. */ + ut_a(!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + /* fall through */ + case FIL_NULL: + /* the column has been freed already */ + continue; + } + + btr_blob_dbg_rbt_delete(index, &b, ctx); + count++; + } + } + + return(count); +} + +/**************************************************************//** +Check that there are no references to off-page columns from or to +the given page. Invoked when freeing or clearing a page. 
+@return TRUE when no orphan references exist */ +UNIV_INTERN +ibool +btr_blob_dbg_is_empty( +/*==================*/ + dict_index_t* index, /*!< in: index */ + ulint page_no) /*!< in: page number */ +{ + const ib_rbt_node_t* node; + ibool success = TRUE; + + if (!index->blobs) { + return(success); + } + + mutex_enter(&index->blobs_mutex); + + for (node = rbt_first(index->blobs); + node != NULL; node = rbt_next(index->blobs, node)) { + const btr_blob_dbg_t* b + = rbt_value(btr_blob_dbg_t, node); + + if (b->ref_page_no != page_no && b->blob_page_no != page_no) { + continue; + } + + fprintf(stderr, + "InnoDB: orphan BLOB ref%s%s%s %u:%u:%u->%u\n", + b->owner ? "" : "(disowned)", + b->always_owner ? "" : "(has disowned)", + b->del ? "(deleted)" : "", + b->ref_page_no, b->ref_heap_no, b->ref_field_no, + b->blob_page_no); + + if (b->blob_page_no != page_no || b->owner || !b->del) { + success = FALSE; + } + } + + mutex_exit(&index->blobs_mutex); + return(success); +} + +/**************************************************************//** +Count and process all references to off-page columns on a page. 
+@return number of references processed */ +UNIV_INTERN +ulint +btr_blob_dbg_op( +/*============*/ + const page_t* page, /*!< in: B-tree leaf page */ + const rec_t* rec, /*!< in: record to start from + (NULL to process the whole page) */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx, /*!< in: context (for logging) */ + const btr_blob_dbg_op_f op) /*!< in: operation on records */ +{ + ulint count = 0; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_a(!rec || page_align(rec) == page); + + if (!index->blobs || !page_is_leaf(page) + || !dict_index_is_clust(index)) { + return(0); + } + + if (rec == NULL) { + rec = page_get_infimum_rec(page); + } + + do { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + count += op(rec, index, offsets, ctx); + rec = page_rec_get_next_const(rec); + } while (!page_rec_is_supremum(rec)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(count); +} + +/**************************************************************//** +Count and add to index->blobs any references to off-page columns +from records on a page. +@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add( +/*=============*/ + const page_t* page, /*!< in: rewritten page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ +{ + btr_blob_dbg_assert_empty(index, page_get_page_no(page)); + + return(btr_blob_dbg_op(page, NULL, index, ctx, btr_blob_dbg_add_rec)); +} + +/**************************************************************//** +Count and remove from index->blobs any references to off-page columns +from records on a page. +Used when reorganizing a page, before copying the records. 
+@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove( +/*================*/ + const page_t* page, /*!< in: b-tree page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ +{ + ulint count; + + count = btr_blob_dbg_op(page, NULL, index, ctx, + btr_blob_dbg_remove_rec); + + /* Check that no references exist. */ + btr_blob_dbg_assert_empty(index, page_get_page_no(page)); + + return(count); +} + +/**************************************************************//** +Restore in index->blobs any references to off-page columns +Used when page reorganize fails due to compressed page overflow. */ +UNIV_INTERN +void +btr_blob_dbg_restore( +/*=================*/ + const page_t* npage, /*!< in: page that failed to compress */ + const page_t* page, /*!< in: copy of original page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ +{ + ulint removed; + ulint added; + + ut_a(page_get_page_no(npage) == page_get_page_no(page)); + ut_a(page_get_space_id(npage) == page_get_space_id(page)); + + removed = btr_blob_dbg_remove(npage, index, ctx); + added = btr_blob_dbg_add(page, index, ctx); + ut_a(added == removed); +} + +/**************************************************************//** +Modify the 'deleted' flag of a record. 
*/ +UNIV_INTERN +void +btr_blob_dbg_set_deleted_flag( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ibool del) /*!< in: TRUE=deleted, FALSE=exists */ +{ + const ib_rbt_node_t* node; + btr_blob_dbg_t b; + btr_blob_dbg_t* c; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_a(dict_index_is_clust(index)); + ut_a(del == !!del);/* must be FALSE==0 or TRUE==1 */ + + if (!rec_offs_any_extern(offsets) || !index->blobs) { + + return; + } + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* field_ref = rec_get_nth_field( + rec, offsets, i, &len); + + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + b.ref_field_no = i; + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + + switch (b.blob_page_no) { + case 0: + ut_a(memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + /* page number 0 is for the + page allocation bitmap */ + case FIL_NULL: + /* the column has been freed already */ + ut_error; + } + + mutex_enter(&index->blobs_mutex); + node = rbt_lookup(index->blobs, &b); + ut_a(node); + + c = rbt_value(btr_blob_dbg_t, node); + /* The flag should be modified. */ + c->del = del; + if (btr_blob_dbg_msg) { + b = *c; + mutex_exit(&index->blobs_mutex); + btr_blob_dbg_msg_issue("del_mk", &b, ""); + } else { + mutex_exit(&index->blobs_mutex); + } + } + } +} + +/**************************************************************//** +Change the ownership of an off-page column. 
*/ +UNIV_INTERN +void +btr_blob_dbg_owner( +/*===============*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ulint i, /*!< in: ith field in rec */ + ibool own) /*!< in: TRUE=owned, FALSE=disowned */ +{ + const ib_rbt_node_t* node; + btr_blob_dbg_t b; + const byte* field_ref; + ulint len; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_a(rec_offs_nth_extern(offsets, i)); + + field_ref = rec_get_nth_field(rec, offsets, i, &len); + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + b.ref_field_no = i; + b.owner = !(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG); + b.blob_page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO); + + ut_a(b.owner == own); + + mutex_enter(&index->blobs_mutex); + node = rbt_lookup(index->blobs, &b); + /* row_ins_clust_index_entry_by_modify() invokes + btr_cur_unmark_extern_fields() also for the newly inserted + references, which are all zero bytes until the columns are stored. + The node lookup must fail if and only if that is the case. */ + ut_a(!memcmp(field_ref, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE) + == !node); + + if (node) { + btr_blob_dbg_t* c = rbt_value(btr_blob_dbg_t, node); + /* Some code sets ownership from TRUE to TRUE. + We do not allow changing ownership from FALSE to FALSE. */ + ut_a(own || c->owner); + + c->owner = own; + if (!own) { + c->always_owner = FALSE; + } + } + + mutex_exit(&index->blobs_mutex); +} +#endif /* UNIV_BLOB_DEBUG */ + /* Latching strategy of the InnoDB B-tree -------------------------------------- @@ -289,7 +843,7 @@ btr_get_next_user_rec( /**************************************************************//** Creates a new index page (not the root, and also not used in page reorganization). @see btr_page_empty(). 
*/ -static +UNIV_INTERN void btr_page_create( /*============*/ @@ -302,6 +856,7 @@ btr_page_create( page_t* page = buf_block_get_frame(block); ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block)); if (UNIV_LIKELY_NULL(page_zip)) { page_create_zip(block, index, level, mtr); @@ -501,6 +1056,7 @@ btr_page_free_low( modify clock */ buf_block_modify_clock_inc(block); + btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block)); if (dict_index_is_ibuf(index)) { @@ -785,6 +1341,13 @@ btr_create( block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr); } else { +#ifdef UNIV_BLOB_DEBUG + if ((type & DICT_CLUSTERED) && !index->blobs) { + mutex_create(&index->blobs_mutex, SYNC_ANY_LATCH); + index->blobs = rbt_create(sizeof(btr_blob_dbg_t), + btr_blob_dbg_cmp); + } +#endif /* UNIV_BLOB_DEBUG */ block = fseg_create(space, 0, PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr); } @@ -1026,6 +1589,7 @@ btr_page_reorganize_low( block->check_index_page_at_flush = TRUE; #endif /* !UNIV_HOTBACKUP */ + btr_blob_dbg_remove(page, index, "btr_page_reorganize"); /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) is preserved intact */ @@ -1054,6 +1618,8 @@ btr_page_reorganize_low( (!page_zip_compress(page_zip, page, index, NULL))) { /* Restore the old page and exit. */ + btr_blob_dbg_restore(page, temp_page, index, + "btr_page_reorganize_compress_fail"); #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG /* Check that the bytes that we skip are identical. */ @@ -1168,7 +1734,7 @@ btr_parse_page_reorganize( #ifndef UNIV_HOTBACKUP /*************************************************************//** Empties an index page. @see btr_page_create(). 
*/ -static +UNIV_INTERN void btr_page_empty( /*===========*/ @@ -1187,6 +1753,7 @@ btr_page_empty( #endif /* UNIV_ZIP_DEBUG */ btr_search_drop_page_hash_index(block); + btr_blob_dbg_remove(page, index, "btr_page_empty"); /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) is preserved intact */ @@ -1729,13 +2296,13 @@ btr_insert_on_non_leaf_level_func( /**************************************************************//** Attaches the halves of an index page on the appropriate level in an index tree. */ -static +UNIV_INTERN void btr_attach_half_pages( /*==================*/ dict_index_t* index, /*!< in: the index tree */ buf_block_t* block, /*!< in/out: page to be split */ - rec_t* split_rec, /*!< in: first record on upper + const rec_t* split_rec, /*!< in: first record on upper half page */ buf_block_t* new_block, /*!< in/out: the new half page */ ulint direction, /*!< in: FSP_UP or FSP_DOWN */ @@ -2427,15 +2994,16 @@ btr_node_ptr_delete( ut_a(err == DB_SUCCESS); if (!compressed) { - btr_cur_compress_if_useful(&cursor, mtr); + btr_cur_compress_if_useful(&cursor, FALSE, mtr); } } /*************************************************************//** If page is the only on its level, this function moves its records to the -father page, thus reducing the tree height. */ +father page, thus reducing the tree height. +@return father block */ static -void +buf_block_t* btr_lift_page_up( /*=============*/ dict_index_t* index, /*!< in: index tree */ @@ -2527,6 +3095,7 @@ btr_lift_page_up( index); } + btr_blob_dbg_remove(page, index, "btr_lift_page_up"); lock_update_copy_and_discard(father_block, block); /* Go upward to root page, decrementing levels by one. 
*/ @@ -2551,6 +3120,8 @@ btr_lift_page_up( } ut_ad(page_validate(father_page, index)); ut_ad(btr_check_node_ptr(index, father_block, mtr)); + + return(father_block); } /*************************************************************//** @@ -2567,11 +3138,13 @@ UNIV_INTERN ibool btr_compress( /*=========*/ - btr_cur_t* cursor, /*!< in: cursor on the page to merge or lift; - the page must not be empty: in record delete - use btr_discard_page if the page would become - empty */ - mtr_t* mtr) /*!< in: mtr */ + btr_cur_t* cursor, /*!< in/out: cursor on the page to merge + or lift; the page must not be empty: + when deleting records, use btr_discard_page() + if the page would become empty */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { dict_index_t* index; ulint space; @@ -2589,12 +3162,14 @@ btr_compress( ulint* offsets; ulint data_size; ulint n_recs; + ulint nth_rec = 0; /* remove bogus warning */ ulint max_ins_size; ulint max_ins_size_reorg; block = btr_cur_get_block(cursor); page = btr_cur_get_page(cursor); index = btr_cur_get_index(cursor); + ut_a((ibool) !!page_is_comp(page) == dict_table_is_comp(index->table)); ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), @@ -2615,6 +3190,10 @@ btr_compress( offsets = btr_page_get_father_block(NULL, heap, index, block, mtr, &father_cursor); + if (adjust) { + nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor)); + } + /* Decide the page to which we try to merge and which will inherit the locks */ @@ -2641,9 +3220,9 @@ btr_compress( } else { /* The page is the only one on the level, lift the records to the father */ - btr_lift_page_up(index, block, mtr); - mem_heap_free(heap); - return(TRUE); + + merge_block = btr_lift_page_up(index, block, mtr); + goto func_exit; } n_recs = page_get_n_recs(page); @@ -2725,6 +3304,10 @@ err_exit: btr_node_ptr_delete(index, block, mtr); lock_update_merge_left(merge_block, 
orig_pred, block); + + if (adjust) { + nth_rec += page_rec_get_n_recs_before(orig_pred); + } } else { rec_t* orig_succ; #ifdef UNIV_BTR_DEBUG @@ -2788,7 +3371,7 @@ err_exit: lock_update_merge_right(merge_block, orig_succ, block); } - mem_heap_free(heap); + btr_blob_dbg_remove(page, index, "btr_compress"); if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) { /* Update the free bits of the B-tree page in the @@ -2840,6 +3423,16 @@ err_exit: btr_page_free(index, block, mtr); ut_ad(btr_check_node_ptr(index, merge_block, mtr)); +func_exit: + mem_heap_free(heap); + + if (adjust) { + btr_cur_position( + index, + page_rec_get_nth(merge_block->frame, nth_rec), + merge_block, cursor); + } + return(TRUE); } @@ -3018,6 +3611,8 @@ btr_discard_page( block); } + btr_blob_dbg_remove(page, index, "btr_discard_page"); + /* Free the file page */ btr_page_free(index, block, mtr); diff --git a/storage/xtradb/btr/btr0cur.c b/storage/xtradb/btr/btr0cur.c index d16522731c4..51a2f784fbf 100644 --- a/storage/xtradb/btr/btr0cur.c +++ b/storage/xtradb/btr/btr0cur.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. 
Portions of this file contain modifications contributed and copyrighted by @@ -1046,6 +1046,11 @@ btr_cur_ins_lock_and_undo( rec_t* rec; roll_ptr_t roll_ptr; + if (thr && thr_get_trx(thr)->fake_changes) { + /* skip LOCK, UNDO */ + return(DB_SUCCESS); + } + /* Check if we have to wait for a lock: enqueue an explicit lock request if yes */ @@ -1177,7 +1182,7 @@ btr_cur_optimistic_insert( } #endif /* UNIV_DEBUG */ - ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); max_size = page_get_max_insert_size_after_reorganize(page, 1); leaf = page_is_leaf(page); @@ -1272,6 +1277,12 @@ fail_err: goto fail_err; } + if (thr && thr_get_trx(thr)->fake_changes) { + /* skip CHANGE, LOG */ + *big_rec = big_rec_vec; + return(err); /* == DB_SUCCESS */ + } + page_cursor = btr_cur_get_page_cur(cursor); /* Now, try the insert */ @@ -1414,10 +1425,10 @@ btr_cur_pessimistic_insert( *big_rec = NULL; - ut_ad(mtr_memo_contains(mtr, + ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, dict_index_get_lock(btr_cur_get_index(cursor)), MTR_MEMO_X_LOCK)); - ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), + ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, btr_cur_get_block(cursor), MTR_MEMO_PAGE_X_FIX)); /* Try first an optimistic insert; reset the cursor flag: we do not @@ -1483,6 +1494,16 @@ btr_cur_pessimistic_insert( } } + if (thr && thr_get_trx(thr)->fake_changes) { + /* skip CHANGE, LOG */ + if (n_extents > 0) { + fil_space_release_free_extents(index->space, + n_reserved); + } + *big_rec = big_rec_vec; + return(DB_SUCCESS); + } + if (dict_index_get_page(index) == buf_block_get_page_no(btr_cur_get_block(cursor))) { @@ -1539,6 +1560,11 @@ btr_cur_upd_lock_and_undo( ut_ad(cursor && update && thr && roll_ptr); + if (thr && thr_get_trx(thr)->fake_changes) { + /* skip LOCK, UNDO */ + return(DB_SUCCESS); + } + rec = btr_cur_get_rec(cursor); 
index = cursor->index; @@ -1837,6 +1863,14 @@ btr_cur_update_in_place( return(err); } + if (trx->fake_changes) { + /* skip CHANGE, LOG */ + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); /* == DB_SUCCESS */ + } + if (block->is_hashed) { /* The function row_upd_changes_ord_field_binary works only if the update vector was built for a clustered index, we must @@ -1929,7 +1963,6 @@ btr_cur_optimistic_update( ulint old_rec_size; dtuple_t* new_entry; roll_ptr_t roll_ptr; - trx_t* trx; mem_heap_t* heap; ulint i; ulint n_ext; @@ -1940,12 +1973,16 @@ btr_cur_optimistic_update( rec = btr_cur_get_rec(cursor); index = cursor->index; ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); - ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); /* The insert buffer tree should never be updated in place. */ ut_ad(!dict_index_is_ibuf(index)); heap = mem_heap_create(1024); offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(rec, offsets) + || trx_is_recv(thr_get_trx(thr))); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ #ifdef UNIV_DEBUG if (btr_cur_print_record_ops && thr) { @@ -2050,6 +2087,11 @@ any_extern: goto err_exit; } + if (thr && thr_get_trx(thr)->fake_changes) { + /* skip CHANGE, LOG */ + goto err_exit; /* == DB_SUCCESS */ + } + /* Ok, we may do the replacement. Store on the page infimum the explicit locks on rec, before deleting rec (see the comment in btr_cur_pessimistic_update). 
*/ @@ -2068,13 +2110,11 @@ any_extern: page_cur_move_to_prev(page_cursor); - trx = thr_get_trx(thr); - if (!(flags & BTR_KEEP_SYS_FLAG)) { row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR, roll_ptr); row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID, - trx->id); + thr_get_trx(thr)->id); } /* There are no externally stored columns in new_entry */ @@ -2160,7 +2200,9 @@ btr_cur_pessimistic_update( /*=======================*/ ulint flags, /*!< in: undo logging, locking, and rollback flags */ - btr_cur_t* cursor, /*!< in: cursor on the record to update */ + btr_cur_t* cursor, /*!< in/out: cursor on the record to update; + cursor may become invalid if *big_rec == NULL + || !(flags & BTR_KEEP_POS_FLAG) */ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to be stored externally by the caller, or NULL */ @@ -2200,9 +2242,9 @@ btr_cur_pessimistic_update( rec = btr_cur_get_rec(cursor); index = cursor->index; - ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK)); - ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); #ifdef UNIV_ZIP_DEBUG ut_a(!page_zip || page_zip_validate(page_zip, page)); #endif /* UNIV_ZIP_DEBUG */ @@ -2290,6 +2332,9 @@ btr_cur_pessimistic_update( ut_ad(big_rec_vec == NULL); + /* fake_changes should not cause undo. so never reaches here */ + ut_ad(!(trx->fake_changes)); + btr_rec_free_updated_extern_fields( index, rec, page_zip, offsets, update, trx_is_recv(trx) ? 
RB_RECOVERY : RB_NORMAL, mtr); @@ -2299,7 +2344,7 @@ btr_cur_pessimistic_update( record to be inserted: we have to remember which fields were such */ ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec)); - offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, heap); + ut_ad(rec_offs_validate(rec, index, offsets)); n_ext += btr_push_update_extern_fields(new_entry, update, *heap); if (UNIV_LIKELY_NULL(page_zip)) { @@ -2322,6 +2367,16 @@ make_external: err = DB_TOO_BIG_RECORD; goto return_after_reservations; } + + ut_ad(page_is_leaf(page)); + ut_ad(dict_index_is_clust(index)); + ut_ad(flags & BTR_KEEP_POS_FLAG); + } + + if (trx->fake_changes) { + /* skip CHANGE, LOG */ + err = DB_SUCCESS; + goto return_after_reservations; } /* Store state of explicit locks on rec on the page infimum record, @@ -2349,6 +2404,8 @@ make_external: rec = btr_cur_insert_if_possible(cursor, new_entry, n_ext, mtr); if (rec) { + page_cursor->rec = rec; + lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor), rec, block); @@ -2362,7 +2419,10 @@ make_external: rec, index, offsets, mtr); } - btr_cur_compress_if_useful(cursor, mtr); + btr_cur_compress_if_useful( + cursor, + big_rec_vec != NULL && (flags & BTR_KEEP_POS_FLAG), + mtr); if (page_zip && !dict_index_is_clust(index) && page_is_leaf(page)) { @@ -2382,6 +2442,21 @@ make_external: } } + if (big_rec_vec) { + ut_ad(page_is_leaf(page)); + ut_ad(dict_index_is_clust(index)); + ut_ad(flags & BTR_KEEP_POS_FLAG); + + /* btr_page_split_and_insert() in + btr_cur_pessimistic_insert() invokes + mtr_memo_release(mtr, index->lock, MTR_MEMO_X_LOCK). + We must keep the index->lock when we created a + big_rec, so that row_upd_clust_rec() can store the + big_rec in the same mini-transaction. */ + + mtr_x_lock(dict_index_get_lock(index), mtr); + } + /* Was the record to be updated positioned as the first user record on its page? 
*/ was_first = page_cur_is_before_first(page_cursor); @@ -2397,6 +2472,7 @@ make_external: ut_a(rec); ut_a(err == DB_SUCCESS); ut_a(dummy_big_rec == NULL); + page_cursor->rec = rec; if (dict_index_is_sec_or_ibuf(index)) { /* Update PAGE_MAX_TRX_ID in the index page header. @@ -2455,6 +2531,39 @@ return_after_reservations: return(err); } +/**************************************************************//** +Commits and restarts a mini-transaction so that it will retain an +x-lock on index->lock and the cursor page. */ +UNIV_INTERN +void +btr_cur_mtr_commit_and_start( +/*=========================*/ + btr_cur_t* cursor, /*!< in: cursor */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + buf_block_t* block; + + block = btr_cur_get_block(cursor); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + /* Keep the locks across the mtr_commit(mtr). */ + rw_lock_x_lock(dict_index_get_lock(cursor->index)); + rw_lock_x_lock(&block->lock); + mutex_enter(&block->mutex); + buf_block_buf_fix_inc(block, __FILE__, __LINE__); + mutex_exit(&block->mutex); + /* Write out the redo log. */ + mtr_commit(mtr); + mtr_start(mtr); + /* Reassociate the locks with the mini-transaction. + They will be released on mtr_commit(mtr). 
*/ + mtr_memo_push(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK); + mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX); +} + /*==================== B-TREE DELETE MARK AND UNMARK ===============*/ /****************************************************************//** @@ -2625,6 +2734,11 @@ btr_cur_del_mark_set_clust_rec( ut_ad(dict_index_is_clust(index)); ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); + if (thr && thr_get_trx(thr)->fake_changes) { + /* skip LOCK, UNDO, CHANGE, LOG */ + return(DB_SUCCESS); + } + err = lock_clust_rec_modify_check_and_lock(flags, block, rec, index, offsets, thr); @@ -2647,6 +2761,7 @@ btr_cur_del_mark_set_clust_rec( page_zip = buf_block_get_page_zip(block); + btr_blob_dbg_set_deleted_flag(rec, index, offsets, val); btr_rec_set_deleted_flag(rec, page_zip, val); trx = thr_get_trx(thr); @@ -2761,6 +2876,11 @@ btr_cur_del_mark_set_sec_rec( rec_t* rec; ulint err; + if (thr && thr_get_trx(thr)->fake_changes) { + /* skip LOCK, CHANGE, LOG */ + return(DB_SUCCESS); + } + block = btr_cur_get_block(cursor); rec = btr_cur_get_rec(cursor); @@ -2833,10 +2953,12 @@ UNIV_INTERN ibool btr_cur_compress_if_useful( /*=======================*/ - btr_cur_t* cursor, /*!< in: cursor on the page to compress; - cursor does not stay valid if compression - occurs */ - mtr_t* mtr) /*!< in: mtr */ + btr_cur_t* cursor, /*!< in/out: cursor on the page to compress; + cursor does not stay valid if !adjust and + compression occurs */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(btr_cur_get_index(cursor)), @@ -2845,7 +2967,7 @@ btr_cur_compress_if_useful( MTR_MEMO_PAGE_X_FIX)); return(btr_cur_compress_recommendation(cursor, mtr) - && btr_compress(cursor, mtr)); + && btr_compress(cursor, adjust, mtr)); } /*******************************************************//** @@ -3092,7 +3214,7 @@ 
return_after_reservations: mem_heap_free(heap); if (ret == FALSE) { - ret = btr_cur_compress_if_useful(cursor, mtr); + ret = btr_cur_compress_if_useful(cursor, FALSE, mtr); } if (n_extents > 0) { @@ -3116,6 +3238,7 @@ btr_cur_add_path_info( { btr_path_t* slot; rec_t* rec; + page_t* page; ut_a(cursor->path_arr); @@ -3138,8 +3261,155 @@ btr_cur_add_path_info( slot = cursor->path_arr + (root_height - height); + page = page_align(rec); + slot->nth_rec = page_rec_get_n_recs_before(rec); - slot->n_recs = page_get_n_recs(page_align(rec)); + slot->n_recs = page_get_n_recs(page); + slot->page_no = page_get_page_no(page); + slot->page_level = btr_page_get_level_low(page); +} + +/*******************************************************************//** +Estimate the number of rows between slot1 and slot2 for any level on a +B-tree. This function starts from slot1->page and reads a few pages to +the right, counting their records. If we reach slot2->page quickly then +we know exactly how many records there are between slot1 and slot2 and +we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly +then we calculate the average number of records in the pages scanned +so far and assume that all pages that we did not scan up to slot2->page +contain the same number of records, then we multiply that average by +the number of pages between slot1->page and slot2->page (which is +n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE. +@return number of rows (exact or estimated) */ +static +ib_int64_t +btr_estimate_n_rows_in_range_on_level( +/*==================================*/ + dict_index_t* index, /*!< in: index */ + btr_path_t* slot1, /*!< in: left border */ + btr_path_t* slot2, /*!< in: right border */ + ib_int64_t n_rows_on_prev_level, /*!< in: number of rows + on the previous level for the + same descend paths; used to + determine the number of pages + on this level */ + ibool* is_n_rows_exact) /*!< out: TRUE if the returned + value is exact i.e. 
not an + estimation */ +{ + ulint space; + ib_int64_t n_rows; + ulint n_pages_read; + ulint page_no; + ulint zip_size; + ulint level; + + space = dict_index_get_space(index); + + n_rows = 0; + n_pages_read = 0; + + /* Assume by default that we will scan all pages between + slot1->page_no and slot2->page_no */ + *is_n_rows_exact = TRUE; + + /* add records from slot1->page_no which are to the right of + the record which serves as a left border of the range, if any */ + if (slot1->nth_rec < slot1->n_recs) { + n_rows += slot1->n_recs - slot1->nth_rec; + } + + /* add records from slot2->page_no which are to the left of + the record which serves as a right border of the range, if any */ + if (slot2->nth_rec > 1) { + n_rows += slot2->nth_rec - 1; + } + + /* count the records in the pages between slot1->page_no and + slot2->page_no (non inclusive), if any */ + + zip_size = fil_space_get_zip_size(space); + + /* Do not read more than this number of pages in order not to hurt + performance with this code which is just an estimation. If we read + this many pages before reaching slot2->page_no then we estimate the + average from the pages scanned so far */ + #define N_PAGES_READ_LIMIT 10 + + page_no = slot1->page_no; + level = slot1->page_level; + + do { + mtr_t mtr; + page_t* page; + buf_block_t* block; + + mtr_start(&mtr); + + /* fetch the page */ + block = buf_page_get(space, zip_size, page_no, RW_S_LATCH, + &mtr); + + page = buf_block_get_frame(block); + + /* It is possible that the tree has been reorganized in the + meantime and this is a different page. If this happens the + calculated estimate will be bogus, which is not fatal as + this is only an estimate. We are sure that a page with + page_no exists because InnoDB never frees pages, only + reuses them. 
*/ + if (fil_page_get_type(page) != FIL_PAGE_INDEX + || ut_dulint_cmp(btr_page_get_index_id(page), index->id) + || btr_page_get_level_low(page) != level) { + + /* The page got reused for something else */ + goto inexact; + } + + n_pages_read++; + + if (page_no != slot1->page_no) { + /* Do not count the records on slot1->page_no, + we already counted them before this loop. */ + n_rows += page_get_n_recs(page); + } + + page_no = btr_page_get_next(page, &mtr); + + mtr_commit(&mtr); + + if (n_pages_read == N_PAGES_READ_LIMIT + || page_no == FIL_NULL) { + /* Either we read too many pages or + we reached the end of the level without passing + through slot2->page_no, the tree must have changed + in the meantime */ + goto inexact; + } + + } while (page_no != slot2->page_no); + + return(n_rows); + +inexact: + + *is_n_rows_exact = FALSE; + + /* We did interrupt before reaching slot2->page */ + + if (n_pages_read > 0) { + /* The number of pages on this level is + n_rows_on_prev_level, multiply it by the + average number of recs per page so far */ + n_rows = n_rows_on_prev_level + * n_rows / n_pages_read; + } else { + /* The tree changed before we could even + start with slot1->page_no */ + n_rows = 10; + } + + return(n_rows); } /*******************************************************************//** @@ -3164,6 +3434,7 @@ btr_estimate_n_rows_in_range( ibool diverged_lot; ulint divergence_level; ib_int64_t n_rows; + ibool is_n_rows_exact; ulint i; mtr_t mtr; @@ -3206,6 +3477,7 @@ btr_estimate_n_rows_in_range( /* We have the path information for the range in path1 and path2 */ n_rows = 1; + is_n_rows_exact = TRUE; diverged = FALSE; /* This becomes true when the path is not the same any more */ diverged_lot = FALSE; /* This becomes true when the paths are @@ -3221,7 +3493,7 @@ btr_estimate_n_rows_in_range( if (slot1->nth_rec == ULINT_UNDEFINED || slot2->nth_rec == ULINT_UNDEFINED) { - if (i > divergence_level + 1) { + if (i > divergence_level + 1 && !is_n_rows_exact) { /* In 
trees whose height is > 1 our algorithm tends to underestimate: multiply the estimate by 2: */ @@ -3233,7 +3505,9 @@ btr_estimate_n_rows_in_range( to over 1 / 2 of the estimated rows in the whole table */ - if (n_rows > index->table->stat_n_rows / 2) { + if (n_rows > index->table->stat_n_rows / 2 + && !is_n_rows_exact) { + n_rows = index->table->stat_n_rows / 2; /* If there are just 0 or 1 rows in the table, @@ -3259,10 +3533,15 @@ btr_estimate_n_rows_in_range( divergence_level = i; } } else { - /* Maybe the tree has changed between - searches */ - - return(10); + /* It is possible that + slot1->nth_rec >= slot2->nth_rec + if, for example, we have a single page + tree which contains (inf, 5, 6, supr) + and we select where x > 20 and x < 30; + in this case slot1->nth_rec will point + to the supr record and slot2->nth_rec + will point to 6 */ + n_rows = 0; } } else if (diverged && !diverged_lot) { @@ -3286,8 +3565,9 @@ btr_estimate_n_rows_in_range( } } else if (diverged_lot) { - n_rows = (n_rows * (slot1->n_recs + slot2->n_recs)) - / 2; + n_rows = btr_estimate_n_rows_in_range_on_level( + index, slot1, slot2, n_rows, + &is_n_rows_exact); } } } @@ -3679,6 +3959,8 @@ btr_cur_set_ownership_of_extern_field( } else { mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val); } + + btr_blob_dbg_owner(rec, index, offsets, i, val); } /*******************************************************************//** @@ -3883,7 +4165,7 @@ btr_blob_free( && buf_block_get_space(block) == space && buf_block_get_page_no(block) == page_no) { - if (buf_LRU_free_block(&block->page, all, TRUE) != BUF_LRU_FREED + if (!buf_LRU_free_block(&block->page, all, TRUE) && all && block->page.zip.data /* Now, buf_LRU_free_block() may release mutex temporarily */ && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE @@ -4184,6 +4466,11 @@ btr_store_big_rec_extern_fields_func( } if (prev_page_no == FIL_NULL) { + btr_blob_dbg_add_blob( + rec, big_rec_vec->fields[i] + .field_no, page_no, index, + "store"); + 
mach_write_to_4(field_ref + BTR_EXTERN_SPACE_ID, space_id); @@ -4259,6 +4546,11 @@ next_zip_page: MLOG_4BYTES, &mtr); if (prev_page_no == FIL_NULL) { + btr_blob_dbg_add_blob( + rec, big_rec_vec->fields[i] + .field_no, page_no, index, + "store"); + mlog_write_ulint(field_ref + BTR_EXTERN_SPACE_ID, space_id, @@ -4427,6 +4719,37 @@ btr_free_externally_stored_field( rec_zip_size = 0; } +#ifdef UNIV_BLOB_DEBUG + if (!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG) + && !((field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG) + && (rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY))) { + /* This off-page column will be freed. + Check that no references remain. */ + + btr_blob_dbg_t b; + + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + + if (rec) { + /* Remove the reference from the record to the + BLOB. If the BLOB were not freed, the + reference would be removed when the record is + removed. Freeing the BLOB will overwrite the + BTR_EXTERN_PAGE_NO in the field_ref of the + record with FIL_NULL, which would make the + btr_blob_dbg information inconsistent with the + record. */ + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + b.ref_field_no = i; + btr_blob_dbg_rbt_delete(index, &b, "free"); + } + + btr_blob_dbg_assert_empty(index, b.blob_page_no); + } +#endif /* UNIV_BLOB_DEBUG */ + for (;;) { #ifdef UNIV_SYNC_DEBUG buf_block_t* rec_block; @@ -4673,27 +4996,45 @@ btr_copy_blob_prefix( /*******************************************************************//** Copies the prefix of a compressed BLOB. The clustered index record -that points to this BLOB must be protected by a lock or a page latch. */ +that points to this BLOB must be protected by a lock or a page latch. 
+@return number of bytes written to buf */ static -void +ulint btr_copy_zblob_prefix( /*==================*/ - z_stream* d_stream,/*!< in/out: the decompressing stream */ + byte* buf, /*!< out: the externally stored part of + the field, or a prefix of it */ + ulint len, /*!< in: length of buf, in bytes */ ulint zip_size,/*!< in: compressed BLOB page size */ ulint space_id,/*!< in: space id of the BLOB pages */ ulint page_no,/*!< in: page number of the first BLOB page */ ulint offset) /*!< in: offset on the first BLOB page */ { - ulint page_type = FIL_PAGE_TYPE_ZBLOB; + ulint page_type = FIL_PAGE_TYPE_ZBLOB; + mem_heap_t* heap; + int err; + z_stream d_stream; + + d_stream.next_out = buf; + d_stream.avail_out = len; + d_stream.next_in = Z_NULL; + d_stream.avail_in = 0; + + /* Zlib inflate needs 32 kilobytes for the default + window size, plus a few kilobytes for small objects. */ + heap = mem_heap_create(40000); + page_zip_set_alloc(&d_stream, heap); ut_ad(ut_is_2pow(zip_size)); ut_ad(zip_size >= PAGE_ZIP_MIN_SIZE); ut_ad(zip_size <= UNIV_PAGE_SIZE); ut_ad(space_id); + err = inflateInit(&d_stream); + ut_a(err == Z_OK); + for (;;) { buf_page_t* bpage; - int err; ulint next_page_no; /* There is no latch on bpage directly. 
Instead, @@ -4709,7 +5050,7 @@ btr_copy_zblob_prefix( " compressed BLOB" " page %lu space %lu\n", (ulong) page_no, (ulong) space_id); - return; + goto func_exit; } if (UNIV_UNLIKELY @@ -4735,13 +5076,13 @@ btr_copy_zblob_prefix( offset += 4; } - d_stream->next_in = bpage->zip.data + offset; - d_stream->avail_in = zip_size - offset; + d_stream.next_in = bpage->zip.data + offset; + d_stream.avail_in = zip_size - offset; - err = inflate(d_stream, Z_NO_FLUSH); + err = inflate(&d_stream, Z_NO_FLUSH); switch (err) { case Z_OK: - if (!d_stream->avail_out) { + if (!d_stream.avail_out) { goto end_of_blob; } break; @@ -4758,13 +5099,13 @@ inflate_error: " compressed BLOB" " page %lu space %lu returned %d (%s)\n", (ulong) page_no, (ulong) space_id, - err, d_stream->msg); + err, d_stream.msg); case Z_BUF_ERROR: goto end_of_blob; } if (next_page_no == FIL_NULL) { - if (!d_stream->avail_in) { + if (!d_stream.avail_in) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: unexpected end of" @@ -4773,7 +5114,7 @@ inflate_error: (ulong) page_no, (ulong) space_id); } else { - err = inflate(d_stream, Z_FINISH); + err = inflate(&d_stream, Z_FINISH); switch (err) { case Z_STREAM_END: case Z_BUF_ERROR: @@ -4785,7 +5126,7 @@ inflate_error: end_of_blob: buf_page_release_zip(bpage); - return; + goto func_exit; } buf_page_release_zip(bpage); @@ -4797,6 +5138,12 @@ end_of_blob: offset = FIL_PAGE_NEXT; page_type = FIL_PAGE_TYPE_ZBLOB2; } + +func_exit: + inflateEnd(&d_stream); + mem_heap_free(heap); + UNIV_MEM_ASSERT_RW(buf, d_stream.total_out); + return(d_stream.total_out); } /*******************************************************************//** @@ -4822,28 +5169,8 @@ btr_copy_externally_stored_field_prefix_low( } if (UNIV_UNLIKELY(zip_size)) { - int err; - z_stream d_stream; - mem_heap_t* heap; - - /* Zlib inflate needs 32 kilobytes for the default - window size, plus a few kilobytes for small objects. 
*/ - heap = mem_heap_create(40000); - page_zip_set_alloc(&d_stream, heap); - - err = inflateInit(&d_stream); - ut_a(err == Z_OK); - - d_stream.next_out = buf; - d_stream.avail_out = len; - d_stream.avail_in = 0; - - btr_copy_zblob_prefix(&d_stream, zip_size, - space_id, page_no, offset); - inflateEnd(&d_stream); - mem_heap_free(heap); - UNIV_MEM_ASSERT_RW(buf, d_stream.total_out); - return(d_stream.total_out); + return(btr_copy_zblob_prefix(buf, len, zip_size, + space_id, page_no, offset)); } else { return(btr_copy_blob_prefix(buf, len, space_id, page_no, offset)); diff --git a/storage/xtradb/btr/btr0pcur.c b/storage/xtradb/btr/btr0pcur.c index f95a5487c94..97fe06f0f5e 100644 --- a/storage/xtradb/btr/btr0pcur.c +++ b/storage/xtradb/btr/btr0pcur.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -362,33 +362,6 @@ btr_pcur_restore_position_func( return(FALSE); } -/**************************************************************//** -If the latch mode of the cursor is BTR_LEAF_SEARCH or BTR_LEAF_MODIFY, -releases the page latch and bufferfix reserved by the cursor. -NOTE! In the case of BTR_LEAF_MODIFY, there should not exist changes -made by the current mini-transaction to the data protected by the -cursor latch, as then the latch must not be released until mtr_commit. 
*/ -UNIV_INTERN -void -btr_pcur_release_leaf( -/*==================*/ - btr_pcur_t* cursor, /*!< in: persistent cursor */ - mtr_t* mtr) /*!< in: mtr */ -{ - buf_block_t* block; - - ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED); - ut_ad(cursor->latch_mode != BTR_NO_LATCHES); - - block = btr_pcur_get_block(cursor); - - btr_leaf_page_release(block, cursor->latch_mode, mtr); - - cursor->latch_mode = BTR_NO_LATCHES; - - cursor->pos_state = BTR_PCUR_WAS_POSITIONED; -} - /*********************************************************//** Moves the persistent cursor to the first record on the next page. Releases the latch on the current page, and bufferunfixes it. Note that there must not be diff --git a/storage/xtradb/btr/btr0sea.c b/storage/xtradb/btr/btr0sea.c index 3b38e2799c2..6e6c533f4af 100644 --- a/storage/xtradb/btr/btr0sea.c +++ b/storage/xtradb/btr/btr0sea.c @@ -1373,8 +1373,8 @@ btr_search_drop_page_hash_when_freed( having to fear a deadlock. */ block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH, NULL, - BUF_GET_IF_IN_POOL, __FILE__, __LINE__, - &mtr); + BUF_PEEK_IF_IN_POOL, __FILE__, __LINE__, + &mtr); /* Because the buffer pool mutex was released by buf_page_peek_if_search_hashed(), it is possible that the block was removed from the buffer pool by another thread diff --git a/storage/xtradb/buf/buf0buddy.c b/storage/xtradb/buf/buf0buddy.c index db94b4bed24..673d6c55efc 100644 --- a/storage/xtradb/buf/buf0buddy.c +++ b/storage/xtradb/buf/buf0buddy.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2006, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -45,6 +45,14 @@ static ulint buf_buddy_n_frames; Protected by buf_pool_mutex. 
*/ UNIV_INTERN buf_buddy_stat_t buf_buddy_stat[BUF_BUDDY_SIZES_MAX + 1]; +/** Validate a given zip_free list. */ +#define BUF_BUDDY_LIST_VALIDATE(i) \ + UT_LIST_VALIDATE(zip_list, buf_page_t, \ + buf_pool->zip_free[i], \ + ut_ad(buf_page_get_state( \ + ut_list_node_313) \ + == BUF_BLOCK_ZIP_FREE)) + /**********************************************************************//** Get the offset of the buddy of a compressed page frame. @return the buddy relative of page */ @@ -76,22 +84,11 @@ buf_buddy_add_to_free( buf_page_t* bpage, /*!< in,own: block to be freed */ ulint i) /*!< in: index of buf_pool->zip_free[] */ { -#ifdef UNIV_DEBUG_VALGRIND - buf_page_t* b = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); - - if (b) UNIV_MEM_VALID(b, BUF_BUDDY_LOW << i); -#endif /* UNIV_DEBUG_VALGRIND */ - //ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(&zip_free_mutex)); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); ut_ad(buf_pool->zip_free[i].start != bpage); UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_free[i], bpage); - -#ifdef UNIV_DEBUG_VALGRIND - if (b) UNIV_MEM_FREE(b, BUF_BUDDY_LOW << i); - UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i); -#endif /* UNIV_DEBUG_VALGRIND */ } /**********************************************************************//** @@ -103,26 +100,18 @@ buf_buddy_remove_from_free( buf_page_t* bpage, /*!< in: block to be removed */ ulint i) /*!< in: index of buf_pool->zip_free[] */ { -#ifdef UNIV_DEBUG_VALGRIND +#ifdef UNIV_DEBUG buf_page_t* prev = UT_LIST_GET_PREV(zip_list, bpage); buf_page_t* next = UT_LIST_GET_NEXT(zip_list, bpage); - if (prev) UNIV_MEM_VALID(prev, BUF_BUDDY_LOW << i); - if (next) UNIV_MEM_VALID(next, BUF_BUDDY_LOW << i); - ut_ad(!prev || buf_page_get_state(prev) == BUF_BLOCK_ZIP_FREE); ut_ad(!next || buf_page_get_state(next) == BUF_BLOCK_ZIP_FREE); -#endif /* UNIV_DEBUG_VALGRIND */ +#endif /* UNIV_DEBUG */ //ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(&zip_free_mutex)); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); 
UT_LIST_REMOVE(zip_list, buf_pool->zip_free[i], bpage); - -#ifdef UNIV_DEBUG_VALGRIND - if (prev) UNIV_MEM_FREE(prev, BUF_BUDDY_LOW << i); - if (next) UNIV_MEM_FREE(next, BUF_BUDDY_LOW << i); -#endif /* UNIV_DEBUG_VALGRIND */ } /**********************************************************************//** @@ -139,17 +128,13 @@ buf_buddy_alloc_zip( //ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(&zip_free_mutex)); ut_a(i < BUF_BUDDY_SIZES); + ut_a(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); + + ut_d(BUF_BUDDY_LIST_VALIDATE(i)); -#ifndef UNIV_DEBUG_VALGRIND - /* Valgrind would complain about accessing free memory. */ - ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i], - ut_ad(buf_page_get_state(ut_list_node_313) - == BUF_BLOCK_ZIP_FREE))); -#endif /* !UNIV_DEBUG_VALGRIND */ bpage = UT_LIST_GET_LAST(buf_pool->zip_free[i]); if (bpage) { - UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); ut_a(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); buf_buddy_remove_from_free(bpage, i); @@ -168,13 +153,10 @@ buf_buddy_alloc_zip( } } -#ifdef UNIV_DEBUG if (bpage) { - memset(bpage, ~i, BUF_BUDDY_LOW << i); + ut_d(memset(bpage, ~i, BUF_BUDDY_LOW << i)); + UNIV_MEM_ALLOC(bpage, BUF_BUDDY_SIZES << i); } -#endif /* UNIV_DEBUG */ - - UNIV_MEM_ALLOC(bpage, BUF_BUDDY_SIZES << i); return(bpage); } @@ -266,6 +248,7 @@ buf_buddy_alloc_from( { ulint offs = BUF_BUDDY_LOW << j; ut_ad(j <= BUF_BUDDY_SIZES); + ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); ut_ad(j >= i); ut_ad(!ut_align_offset(buf, offs)); @@ -279,13 +262,7 @@ buf_buddy_alloc_from( bpage = (buf_page_t*) ((byte*) buf + offs); ut_d(memset(bpage, j, BUF_BUDDY_LOW << j)); bpage->state = BUF_BLOCK_ZIP_FREE; -#ifndef UNIV_DEBUG_VALGRIND - /* Valgrind would complain about accessing free memory. 
*/ - ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i], - ut_ad(buf_page_get_state( - ut_list_node_313) - == BUF_BLOCK_ZIP_FREE))); -#endif /* !UNIV_DEBUG_VALGRIND */ + ut_d(BUF_BUDDY_LIST_VALIDATE(i)); buf_buddy_add_to_free(bpage, j); } @@ -295,8 +272,8 @@ buf_buddy_alloc_from( /**********************************************************************//** Allocate a block. The thread calling this function must hold buf_pool_mutex and must not hold buf_pool_zip_mutex or any block->mutex. -The buf_pool_mutex may only be released and reacquired if lru != NULL. -@return allocated block, possibly NULL if lru==NULL */ +The buf_pool_mutex may be released and reacquired. +@return allocated block, never NULL */ UNIV_INTERN void* buf_buddy_alloc_low( @@ -305,14 +282,15 @@ buf_buddy_alloc_low( or BUF_BUDDY_SIZES */ ibool* lru, /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list - and buf_pool_mutex was temporarily released, - or NULL if the LRU list should not be used */ + and buf_pool_mutex was temporarily released */ ibool have_page_hash_mutex) { buf_block_t* block; + ut_ad(lru); //ut_ad(buf_pool_mutex_own()); ut_ad(!mutex_own(&buf_pool_zip_mutex)); + ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); if (i < BUF_BUDDY_SIZES) { /* Try to allocate from the buddy system. */ @@ -335,11 +313,6 @@ buf_buddy_alloc_low( goto alloc_big; } - if (!lru) { - - return(NULL); - } - /* Try replacing an uncompressed page in the buffer pool. */ //buf_pool_mutex_exit(); mutex_exit(&LRU_list_mutex); @@ -368,76 +341,6 @@ func_exit: } /**********************************************************************//** -Try to relocate the control block of a compressed page. 
-@return TRUE if relocated */ -static -ibool -buf_buddy_relocate_block( -/*=====================*/ - buf_page_t* bpage, /*!< in: block to relocate */ - buf_page_t* dpage) /*!< in: free block to relocate to */ -{ - buf_page_t* b; - - //ut_ad(buf_pool_mutex_own()); -#ifdef UNIV_SYNC_DEBUG - ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX)); -#endif - - switch (buf_page_get_state(bpage)) { - case BUF_BLOCK_ZIP_FREE: - case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: - case BUF_BLOCK_FILE_PAGE: - case BUF_BLOCK_MEMORY: - case BUF_BLOCK_REMOVE_HASH: - /* ut_error; */ /* optimistic */ - case BUF_BLOCK_ZIP_DIRTY: - /* Cannot relocate dirty pages. */ - return(FALSE); - - case BUF_BLOCK_ZIP_PAGE: - break; - } - - mutex_enter(&buf_pool_zip_mutex); - mutex_enter(&zip_free_mutex); - - if (!buf_page_can_relocate(bpage)) { - mutex_exit(&buf_pool_zip_mutex); - mutex_exit(&zip_free_mutex); - return(FALSE); - } - - if (bpage != buf_page_hash_get(bpage->space, bpage->offset)) { - mutex_exit(&buf_pool_zip_mutex); - mutex_exit(&zip_free_mutex); - return(FALSE); - } - - buf_relocate(bpage, dpage); - ut_d(bpage->state = BUF_BLOCK_ZIP_FREE); - - /* relocate buf_pool->zip_clean */ - mutex_enter(&flush_list_mutex); - b = UT_LIST_GET_PREV(zip_list, dpage); - UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, dpage); - - if (b) { - UT_LIST_INSERT_AFTER(zip_list, buf_pool->zip_clean, b, dpage); - } else { - UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_clean, dpage); - } - mutex_exit(&flush_list_mutex); - - UNIV_MEM_INVALID(bpage, sizeof *bpage); - - mutex_exit(&buf_pool_zip_mutex); - mutex_exit(&zip_free_mutex); - return(TRUE); -} - -/**********************************************************************//** Try to relocate a block. 
@return TRUE if relocated */ static @@ -452,159 +355,120 @@ buf_buddy_relocate( buf_page_t* bpage; const ulint size = BUF_BUDDY_LOW << i; ullint usec = ut_time_us(NULL); + mutex_t* mutex; + ulint space; + ulint page_no; //ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(&zip_free_mutex)); ut_ad(!mutex_own(&buf_pool_zip_mutex)); ut_ad(!ut_align_offset(src, size)); ut_ad(!ut_align_offset(dst, size)); + ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); UNIV_MEM_ASSERT_W(dst, size); + if (!have_page_hash_mutex) { + mutex_exit(&zip_free_mutex); + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); + } + /* We assume that all memory from buf_buddy_alloc() - is used for either compressed pages or buf_page_t - objects covering compressed pages. */ + is used for compressed page frames. */ /* We look inside the allocated objects returned by - buf_buddy_alloc() and assume that anything of - PAGE_ZIP_MIN_SIZE or larger is a compressed page that contains - a valid space_id and page_no in the page header. Should the - fields be invalid, we will be unable to relocate the block. - We also assume that anything that fits sizeof(buf_page_t) - actually is a properly initialized buf_page_t object. */ - - if (size >= PAGE_ZIP_MIN_SIZE) { - /* This is a compressed page. */ - mutex_t* mutex; - ulint space, page_no; + buf_buddy_alloc() and assume that each block is a compressed + page that contains a valid space_id and page_no in the page + header. Should the fields be invalid, we will be unable to + relocate the block. */ + + /* The src block may be split into smaller blocks, + some of which may be free. Thus, the + mach_read_from_4() calls below may attempt to read + from free memory. The memory is "owned" by the buddy + allocator (and it has been allocated from the buffer + pool), so there is nothing wrong about this. The + mach_read_from_4() calls here will only trigger bogus + Valgrind memcheck warnings in UNIV_DEBUG_VALGRIND builds. 
*/ + space = mach_read_from_4((const byte *) src + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + page_no = mach_read_from_4((const byte *) src + + FIL_PAGE_OFFSET); + /* Suppress Valgrind warnings about conditional jump + on uninitialized value. */ + UNIV_MEM_VALID(&space, sizeof space); + UNIV_MEM_VALID(&page_no, sizeof page_no); + bpage = buf_page_hash_get(space, page_no); + + if (!bpage || bpage->zip.data != src) { + /* The block has probably been freshly + allocated by buf_LRU_get_free_block() but not + added to buf_pool->page_hash yet. Obviously, + it cannot be relocated. */ if (!have_page_hash_mutex) { - mutex_exit(&zip_free_mutex); - mutex_enter(&LRU_list_mutex); - rw_lock_x_lock(&page_hash_latch); - } - - /* The src block may be split into smaller blocks, - some of which may be free. Thus, the - mach_read_from_4() calls below may attempt to read - from free memory. The memory is "owned" by the buddy - allocator (and it has been allocated from the buffer - pool), so there is nothing wrong about this. The - mach_read_from_4() calls here will only trigger bogus - Valgrind memcheck warnings in UNIV_DEBUG_VALGRIND builds. */ - space = mach_read_from_4( - (const byte*) src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); - page_no = mach_read_from_4( - (const byte*) src + FIL_PAGE_OFFSET); - /* Suppress Valgrind warnings about conditional jump - on uninitialized value. */ - UNIV_MEM_VALID(&space, sizeof space); - UNIV_MEM_VALID(&page_no, sizeof page_no); - bpage = buf_page_hash_get(space, page_no); - - if (!bpage || bpage->zip.data != src) { - /* The block has probably been freshly - allocated by buf_LRU_get_free_block() but not - added to buf_pool->page_hash yet. Obviously, - it cannot be relocated. */ - - if (!have_page_hash_mutex) { - mutex_enter(&zip_free_mutex); - mutex_exit(&LRU_list_mutex); - rw_lock_x_unlock(&page_hash_latch); - } - return(FALSE); - } - - if (page_zip_get_size(&bpage->zip) != size) { - /* The block is of different size. 
We would - have to relocate all blocks covered by src. - For the sake of simplicity, give up. */ - ut_ad(page_zip_get_size(&bpage->zip) < size); - - if (!have_page_hash_mutex) { - mutex_enter(&zip_free_mutex); - mutex_exit(&LRU_list_mutex); - rw_lock_x_unlock(&page_hash_latch); - } - return(FALSE); + mutex_enter(&zip_free_mutex); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); } + return(FALSE); + } - /* To keep latch order */ - if (have_page_hash_mutex) - mutex_exit(&zip_free_mutex); - - /* The block must have been allocated, but it may - contain uninitialized data. */ - UNIV_MEM_ASSERT_W(src, size); - - mutex = buf_page_get_mutex_enter(bpage); - - mutex_enter(&zip_free_mutex); - - if (mutex && buf_page_can_relocate(bpage)) { - /* Relocate the compressed page. */ - ut_a(bpage->zip.data == src); - memcpy(dst, src, size); - bpage->zip.data = dst; - mutex_exit(mutex); -success: - UNIV_MEM_INVALID(src, size); - { - buf_buddy_stat_t* buddy_stat - = &buf_buddy_stat[i]; - buddy_stat->relocated++; - buddy_stat->relocated_usec - += ut_time_us(NULL) - usec; - } - - if (!have_page_hash_mutex) { - mutex_exit(&LRU_list_mutex); - rw_lock_x_unlock(&page_hash_latch); - } - return(TRUE); - } + if (page_zip_get_size(&bpage->zip) != size) { + /* The block is of different size. We would + have to relocate all blocks covered by src. + For the sake of simplicity, give up. */ + ut_ad(page_zip_get_size(&bpage->zip) < size); if (!have_page_hash_mutex) { + mutex_enter(&zip_free_mutex); mutex_exit(&LRU_list_mutex); rw_lock_x_unlock(&page_hash_latch); } + return(FALSE); + } - if (mutex) { - mutex_exit(mutex); - } - } else if (i == buf_buddy_get_slot(sizeof(buf_page_t))) { - /* This must be a buf_page_t object. */ -#if UNIV_WORD_SIZE == 4 - /* On 32-bit systems, there is no padding in - buf_page_t. On other systems, Valgrind could complain - about uninitialized pad bytes. 
*/ - UNIV_MEM_ASSERT_RW(src, size); -#endif - + /* To keep latch order */ + if (have_page_hash_mutex) mutex_exit(&zip_free_mutex); - if (!have_page_hash_mutex) { - mutex_enter(&LRU_list_mutex); - rw_lock_x_lock(&page_hash_latch); - } + /* The block must have been allocated, but it may + contain uninitialized data. */ + UNIV_MEM_ASSERT_W(src, size); - if (buf_buddy_relocate_block(src, dst)) { - mutex_enter(&zip_free_mutex); + mutex = buf_page_get_mutex_enter(bpage); - if (!have_page_hash_mutex) { - mutex_exit(&LRU_list_mutex); - rw_lock_x_unlock(&page_hash_latch); - } + mutex_enter(&zip_free_mutex); - goto success; + if (mutex && buf_page_can_relocate(bpage)) { + /* Relocate the compressed page. */ + ut_a(bpage->zip.data == src); + memcpy(dst, src, size); + bpage->zip.data = dst; + mutex_exit(mutex); + UNIV_MEM_INVALID(src, size); + { + buf_buddy_stat_t* buddy_stat + = &buf_buddy_stat[i]; + buddy_stat->relocated++; + buddy_stat->relocated_usec + += ut_time_us(NULL) - usec; } - mutex_enter(&zip_free_mutex); - if (!have_page_hash_mutex) { mutex_exit(&LRU_list_mutex); rw_lock_x_unlock(&page_hash_latch); } + return(TRUE); + } + + if (!have_page_hash_mutex) { + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + } + + if (mutex) { + mutex_exit(mutex); } return(FALSE); @@ -629,12 +493,14 @@ buf_buddy_free_low( ut_ad(mutex_own(&zip_free_mutex)); ut_ad(!mutex_own(&buf_pool_zip_mutex)); ut_ad(i <= BUF_BUDDY_SIZES); + ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); ut_ad(buf_buddy_stat[i].used > 0); buf_buddy_stat[i].used--; + recombine: UNIV_MEM_ASSERT_AND_ALLOC(buf, BUF_BUDDY_LOW << i); - ut_d(((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE); + ((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE; if (i == BUF_BUDDY_SIZES) { mutex_exit(&zip_free_mutex); @@ -647,32 +513,36 @@ recombine: ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i)); ut_ad(!buf_pool_contains_zip(buf)); - /* Try to combine adjacent blocks. 
*/ + /* Do not recombine blocks if there are few free blocks. + We may waste up to 15360*max_len bytes to free blocks + (1024 + 2048 + 4096 + 8192 = 15360) */ + if (UT_LIST_GET_LEN(buf_pool->zip_free[i]) < 16) { + goto func_exit; + } + /* Try to combine adjacent blocks. */ buddy = (buf_page_t*) buf_buddy_get(((byte*) buf), BUF_BUDDY_LOW << i); #ifndef UNIV_DEBUG_VALGRIND - /* Valgrind would complain about accessing free memory. */ + /* When Valgrind instrumentation is not enabled, we can read + buddy->state to quickly determine that a block is not free. + When the block is not free, buddy->state belongs to a compressed + page frame that may be flagged uninitialized in our Valgrind + instrumentation. */ if (buddy->state != BUF_BLOCK_ZIP_FREE) { goto buddy_nonfree; } - - /* The field buddy->state can only be trusted for free blocks. - If buddy->state == BUF_BLOCK_ZIP_FREE, the block is free if - it is in the free list. */ #endif /* !UNIV_DEBUG_VALGRIND */ for (bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); bpage; ) { - UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); if (bpage == buddy) { -buddy_free: /* The buddy is free: recombine */ buf_buddy_remove_from_free(bpage, i); -buddy_free2: +buddy_is_free: ut_ad(buf_page_get_state(buddy) == BUF_BLOCK_ZIP_FREE); ut_ad(!buf_pool_contains_zip(buddy)); i++; @@ -682,122 +552,43 @@ buddy_free2: } ut_a(bpage != buf); - - { - buf_page_t* next = UT_LIST_GET_NEXT(zip_list, bpage); - UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i); - bpage = next; - } + UNIV_MEM_ASSERT_W(bpage, BUF_BUDDY_LOW << i); + bpage = UT_LIST_GET_NEXT(zip_list, bpage); } #ifndef UNIV_DEBUG_VALGRIND buddy_nonfree: - /* Valgrind would complain about accessing free memory. 
*/ - ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i], - ut_ad(buf_page_get_state(ut_list_node_313) - == BUF_BLOCK_ZIP_FREE))); -#endif /* UNIV_DEBUG_VALGRIND */ +#endif /* !UNIV_DEBUG_VALGRIND */ + + ut_d(BUF_BUDDY_LIST_VALIDATE(i)); /* The buddy is not free. Is there a free block of this size? */ bpage = UT_LIST_GET_LAST(buf_pool->zip_free[i]); if (bpage) { + /* Remove the block from the free list, because a successful buf_buddy_relocate() will overwrite bpage->list. */ - - UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); buf_buddy_remove_from_free(bpage, i); /* Try to relocate the buddy of buf to the free block. */ if (buf_buddy_relocate(buddy, bpage, i, have_page_hash_mutex)) { - ut_d(buddy->state = BUF_BLOCK_ZIP_FREE); - goto buddy_free2; + buddy->state = BUF_BLOCK_ZIP_FREE; + goto buddy_is_free; } buf_buddy_add_to_free(bpage, i); - - /* Try to relocate the buddy of the free block to buf. */ - buddy = (buf_page_t*) buf_buddy_get(((byte*) bpage), - BUF_BUDDY_LOW << i); - -#ifndef UNIV_DEBUG_VALGRIND - /* Valgrind would complain about accessing free memory. */ - - /* The buddy must not be (completely) free, because we - always recombine adjacent free blocks. - - (Parts of the buddy can be free in - buf_pool->zip_free[j] with j < i.) */ - ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i], - ut_ad(buf_page_get_state( - ut_list_node_313) - == BUF_BLOCK_ZIP_FREE - && ut_list_node_313 != buddy))); -#endif /* !UNIV_DEBUG_VALGRIND */ - - if (buf_buddy_relocate(buddy, buf, i, have_page_hash_mutex)) { - - buf = bpage; - UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); - ut_d(buddy->state = BUF_BLOCK_ZIP_FREE); - goto buddy_free; - } } +func_exit: /* Free the block to the buddy list. */ bpage = buf; -#ifdef UNIV_DEBUG - if (i < buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)) { - /* This area has most likely been allocated for at - least one compressed-only block descriptor. Check - that there are no live objects in the area. 
This is - not a complete check: it may yield false positives as - well as false negatives. Also, due to buddy blocks - being recombined, it is possible (although unlikely) - that this branch is never reached. */ - - char* c; - -# ifndef UNIV_DEBUG_VALGRIND - /* Valgrind would complain about accessing - uninitialized memory. Besides, Valgrind performs a - more exhaustive check, at every memory access. */ - const buf_page_t* b = buf; - const buf_page_t* const b_end = (buf_page_t*) - ((char*) b + (BUF_BUDDY_LOW << i)); - - for (; b < b_end; b++) { - /* Avoid false positives (and cause false - negatives) by checking for b->space < 1000. */ - - if ((b->state == BUF_BLOCK_ZIP_PAGE - || b->state == BUF_BLOCK_ZIP_DIRTY) - && b->space > 0 && b->space < 1000) { - fprintf(stderr, - "buddy dirty %p %u (%u,%u) %p,%lu\n", - (void*) b, - b->state, b->space, b->offset, - buf, i); - } - } -# endif /* !UNIV_DEBUG_VALGRIND */ - - /* Scramble the block. This should make any pointers - invalid and trigger a segmentation violation. Because - the scrambling can be reversed, it may be possible to - track down the object pointing to the freed data by - dereferencing the unscrambled bpage->LRU or - bpage->list pointers. */ - for (c = (char*) buf + (BUF_BUDDY_LOW << i); - c-- > (char*) buf; ) { - *c = ~*c ^ i; - } - } else { - /* Fill large blocks with a constant pattern. */ - memset(bpage, i, BUF_BUDDY_LOW << i); - } -#endif /* UNIV_DEBUG */ + + /* Fill large blocks with a constant pattern. */ + ut_d(memset(bpage, i, BUF_BUDDY_LOW << i)); + UNIV_MEM_INVALID(bpage, BUF_BUDDY_LOW << i); + bpage->state = BUF_BLOCK_ZIP_FREE; buf_buddy_add_to_free(bpage, i); } diff --git a/storage/xtradb/buf/buf0buf.c b/storage/xtradb/buf/buf0buf.c index 5616455b289..c76973a42ef 100644 --- a/storage/xtradb/buf/buf0buf.c +++ b/storage/xtradb/buf/buf0buf.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. 
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -53,10 +53,6 @@ Created 11/5/1995 Heikki Tuuri #include "page0zip.h" #include "trx0trx.h" #include "srv0start.h" -#include "que0que.h" -#include "read0read.h" -#include "row0row.h" -#include "ha_prototypes.h" /* prototypes for new functions added to ha_innodb.cc */ trx_t* innobase_get_trx(); @@ -314,30 +310,6 @@ read-ahead or flush occurs */ UNIV_INTERN ibool buf_debug_prints = FALSE; #endif /* UNIV_DEBUG */ -/* Buffer pool shared memory segment information */ -typedef struct buf_shm_info_struct buf_shm_info_t; - -struct buf_shm_info_struct { - char head_str[8]; - ulint binary_id; - ibool is_new; /* during initializing */ - ibool clean; /* clean shutdowned and free */ - ibool reusable; /* reusable */ - ulint buf_pool_size; /* backup value */ - ulint page_size; /* backup value */ - ulint frame_offset; /* offset of the first frame based on chunk->mem */ - ulint zip_hash_offset; - ulint zip_hash_n; - - ulint checksum; - - buf_pool_t buf_pool_backup; - buf_chunk_t chunk_backup; - - ib_uint64_t dummy; -}; - -#define BUF_SHM_INFO_HEAD "XTRA_SHM" #endif /* !UNIV_HOTBACKUP */ /********************************************************************//** @@ -767,6 +739,10 @@ buf_block_init( block->page.in_flush_list = FALSE; block->page.in_free_list = FALSE; #endif /* UNIV_DEBUG */ + block->page.flush_list.prev = NULL; + block->page.flush_list.next = NULL; + block->page.zip_list.prev = NULL; + block->page.zip_list.next = NULL; block->page.in_LRU_list = FALSE; block->in_unzip_LRU_list = FALSE; #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG @@ -784,45 +760,6 @@ buf_block_init( #endif /* UNIV_SYNC_DEBUG */ } -static -void -buf_block_reuse( -/*============*/ - buf_block_t* block, - ptrdiff_t frame_offset) -{ - /* block_init */ - block->frame += frame_offset; - - UNIV_MEM_DESC(block->frame, 
UNIV_PAGE_SIZE, block); - - block->index = NULL; - -#ifdef UNIV_DEBUG - /* recreate later */ - block->page.in_page_hash = FALSE; - block->page.in_zip_hash = FALSE; -#endif /* UNIV_DEBUG */ - -#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG - block->n_pointers = 0; -#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ - - if (block->page.zip.data) - block->page.zip.data += frame_offset; - - block->is_hashed = FALSE; - - mutex_create(&block->mutex, SYNC_BUF_BLOCK); - - rw_lock_create(&block->lock, SYNC_LEVEL_VARYING); - ut_ad(rw_lock_validate(&(block->lock))); - -#ifdef UNIV_SYNC_DEBUG - rw_lock_create(&block->debug_latch, SYNC_NO_ORDER_CHECK); -#endif /* UNIV_SYNC_DEBUG */ -} - /********************************************************************//** Allocates a chunk of buffer frames. @return chunk, or NULL on failure */ @@ -835,190 +772,28 @@ buf_chunk_init( { buf_block_t* block; byte* frame; - ulint zip_hash_n = 0; - ulint zip_hash_mem_size = 0; - hash_table_t* zip_hash_tmp = NULL; ulint i; ulint size_target; - buf_shm_info_t* shm_info = NULL; /* Round down to a multiple of page size, although it already should be. */ mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE); size_target = (mem_size / UNIV_PAGE_SIZE) - 1; - - srv_buffer_pool_shm_is_reused = FALSE; - - if (srv_buffer_pool_shm_key) { - /* zip_hash size */ - zip_hash_n = (mem_size / UNIV_PAGE_SIZE) * 2; - zip_hash_mem_size = ut_2pow_round(hash_create_needed(zip_hash_n) - + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE); - } - /* Reserve space for the block descriptors. 
*/ mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block) + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE); - if (srv_buffer_pool_shm_key) { - mem_size += ut_2pow_round(sizeof(buf_shm_info_t) - + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE); - mem_size += zip_hash_mem_size; - } chunk->mem_size = mem_size; - - if (srv_buffer_pool_shm_key) { - ulint binary_id; - ibool is_new; - - ut_a(buf_pool->n_chunks == 1); - - fprintf(stderr, - "InnoDB: Warning: The innodb_buffer_pool_shm_key option has been specified.\n" - "InnoDB: Do not change the following between restarts of the server while this option is being used:\n" - "InnoDB: * the mysqld executable between restarts of the server.\n" - "InnoDB: * the value of innodb_buffer_pool_size.\n" - "InnoDB: * the value of innodb_page_size.\n" - "InnoDB: * datafiles created by InnoDB during this session.\n" - "InnoDB: Otherwise, data corruption in datafiles may result.\n"); - - /* FIXME: This is vague id still */ - binary_id = (ulint) ((byte*)mtr_commit - (byte*)btr_root_get) - + (ulint) ((byte*)os_get_os_version - (byte*)buf_calc_page_new_checksum) - + (ulint) ((byte*)page_dir_find_owner_slot - (byte*)dfield_data_is_binary_equal) - + (ulint) ((byte*)que_graph_publish - (byte*)dict_casedn_str) - + (ulint) ((byte*)read_view_oldest_copy_or_open_new - (byte*)fil_space_get_version) - + (ulint) ((byte*)rec_get_n_extern_new - (byte*)fsp_get_size_low) - + (ulint) ((byte*)row_get_trx_id_offset - (byte*)ha_create_func) - + (ulint) ((byte*)srv_set_io_thread_op_info - (byte*)thd_is_replication_slave_thread) - + (ulint) ((byte*)mutex_create_func - (byte*)ibuf_inside) - + (ulint) ((byte*)trx_set_detailed_error - (byte*)lock_check_trx_id_sanity) - + (ulint) ((byte*)ut_time - (byte*)mem_heap_strdup); - - chunk->mem = os_shm_alloc(&chunk->mem_size, srv_buffer_pool_shm_key, &is_new); - - if (UNIV_UNLIKELY(chunk->mem == NULL)) { - return(NULL); - } -init_again: -#ifdef UNIV_SET_MEM_TO_ZERO - if (is_new) { - memset(chunk->mem, '\0', 
chunk->mem_size); - } -#endif - /* for ut_fold_binary_32(), these values should be 32-bit aligned */ - ut_a(sizeof(buf_shm_info_t) % 4 == 0); - ut_a((ulint)chunk->mem % 4 == 0); - ut_a(chunk->mem_size % 4 == 0); - - shm_info = chunk->mem; - - zip_hash_tmp = (hash_table_t*)((byte*)chunk->mem + chunk->mem_size - zip_hash_mem_size); - - if (is_new) { - strncpy(shm_info->head_str, BUF_SHM_INFO_HEAD, 8); - shm_info->binary_id = binary_id; - shm_info->is_new = TRUE; /* changed to FALSE when the initialization is finished */ - shm_info->clean = FALSE; /* changed to TRUE when free the segment. */ - shm_info->reusable = FALSE; /* changed to TRUE when validation is finished. */ - shm_info->buf_pool_size = srv_buf_pool_size; - shm_info->page_size = srv_page_size; - shm_info->zip_hash_offset = chunk->mem_size - zip_hash_mem_size; - shm_info->zip_hash_n = zip_hash_n; - } else { - ulint checksum; - - if (strncmp(shm_info->head_str, BUF_SHM_INFO_HEAD, 8)) { - fprintf(stderr, - "InnoDB: Error: The shared memory segment seems not to be for buffer pool.\n"); - return(NULL); - } - if (shm_info->binary_id != binary_id) { - fprintf(stderr, - "InnoDB: Error: The shared memory segment seems not to be for this binary.\n"); - return(NULL); - } - if (shm_info->is_new) { - fprintf(stderr, - "InnoDB: Error: The shared memory was not initialized yet.\n"); - return(NULL); - } - if (shm_info->buf_pool_size != srv_buf_pool_size) { - fprintf(stderr, - "InnoDB: Error: srv_buf_pool_size is different (shm=%lu current=%lu).\n", - shm_info->buf_pool_size, srv_buf_pool_size); - return(NULL); - } - if (shm_info->page_size != srv_page_size) { - fprintf(stderr, - "InnoDB: Error: srv_page_size is different (shm=%lu current=%lu).\n", - shm_info->page_size, srv_page_size); - return(NULL); - } - if (!shm_info->reusable) { - fprintf(stderr, - "InnoDB: Warning: The shared memory has unrecoverable contents.\n" - "InnoDB: The shared memory segment is initialized.\n"); - is_new = TRUE; - goto init_again; - } - if 
(!shm_info->clean) { - fprintf(stderr, - "InnoDB: Warning: The shared memory was not shut down cleanly.\n" - "InnoDB: The shared memory segment is initialized.\n"); - is_new = TRUE; - goto init_again; - } - - ut_a(shm_info->zip_hash_offset == chunk->mem_size - zip_hash_mem_size); - ut_a(shm_info->zip_hash_n == zip_hash_n); - - /* check checksum */ - if (srv_buffer_pool_shm_checksum) { - checksum = ut_fold_binary_32((byte*)chunk->mem + sizeof(buf_shm_info_t), - chunk->mem_size - sizeof(buf_shm_info_t)); - } else { - checksum = BUF_NO_CHECKSUM_MAGIC; - } - - if (shm_info->checksum != BUF_NO_CHECKSUM_MAGIC - && shm_info->checksum != checksum) { - fprintf(stderr, - "InnoDB: Error: checksum of the shared memory is not match. " - "(stored=%lu calculated=%lu)\n", - shm_info->checksum, checksum); - return(NULL); - } - - /* flag to use the segment. */ - shm_info->clean = FALSE; /* changed to TRUE when free the segment. */ - } - - /* init zip_hash contents */ - if (is_new) { - hash_create_init(zip_hash_tmp, zip_hash_n); - } else { - /* adjust offset is done later */ - hash_create_reuse(zip_hash_tmp); - - srv_buffer_pool_shm_is_reused = TRUE; - } - } else { chunk->mem = os_mem_alloc_large(&chunk->mem_size); if (UNIV_UNLIKELY(chunk->mem == NULL)) { return(NULL); } - } /* Allocate the block descriptors from the start of the memory block. */ - if (srv_buffer_pool_shm_key) { - chunk->blocks = (buf_block_t*)((byte*)chunk->mem + sizeof(buf_shm_info_t)); - } else { chunk->blocks = chunk->mem; - } /* Align a pointer to the first frame. Note that when os_large_page_size is smaller than UNIV_PAGE_SIZE, @@ -1026,13 +801,8 @@ init_again: it is bigger, we may allocate more blocks than requested. 
*/ frame = ut_align(chunk->mem, UNIV_PAGE_SIZE); - if (srv_buffer_pool_shm_key) { - /* reserve zip_hash space and always -1 for reproductibity */ - chunk->size = (chunk->mem_size - zip_hash_mem_size) / UNIV_PAGE_SIZE - 1; - } else { chunk->size = chunk->mem_size / UNIV_PAGE_SIZE - (frame != chunk->mem); - } /* Subtract the space needed for block descriptors. */ { @@ -1050,99 +820,6 @@ init_again: chunk->size = size_target; } - if (shm_info && !(shm_info->is_new)) { - /* convert the shared memory segment for reuse */ - ptrdiff_t phys_offset; - ptrdiff_t logi_offset; - ptrdiff_t blocks_offset; - void* previous_frame_address; - - if (chunk->size < shm_info->chunk_backup.size) { - fprintf(stderr, - "InnoDB: Error: The buffer pool became smaller because of allocated address.\n" - "InnoDB: Retrying may avoid this situation.\n"); - shm_info->clean = TRUE; /* release the flag for retrying */ - return(NULL); - } - - chunk->size = shm_info->chunk_backup.size; - phys_offset = frame - ((byte*)chunk->mem + shm_info->frame_offset); - logi_offset = frame - chunk->blocks[0].frame; - previous_frame_address = chunk->blocks[0].frame; - blocks_offset = (byte*)chunk->blocks - (byte*)shm_info->chunk_backup.blocks; - - if (phys_offset || logi_offset || blocks_offset) { - fprintf(stderr, - "InnoDB: Buffer pool in the shared memory segment should be converted.\n" - "InnoDB: Previous frames in address : %p\n" - "InnoDB: Previous frames were located : %p\n" - "InnoDB: Current frames should be located: %p\n" - "InnoDB: Pysical offset : %ld (%#lx)\n" - "InnoDB: Logical offset (frames) : %ld (%#lx)\n" - "InnoDB: Logical offset (blocks) : %ld (%#lx)\n", - (byte*)chunk->mem + shm_info->frame_offset, - chunk->blocks[0].frame, frame, - (long) phys_offset, (long) phys_offset, - (long) logi_offset, (long) logi_offset, - (long) blocks_offset, (long) blocks_offset); - } else { - fprintf(stderr, - "InnoDB: Buffer pool in the shared memory segment can be used as it is.\n"); - } - - if (phys_offset) { - 
fprintf(stderr, - "InnoDB: Aligning physical offset..."); - - memmove(frame, (byte*)chunk->mem + shm_info->frame_offset, - chunk->size * UNIV_PAGE_SIZE); - - fprintf(stderr, - " Done.\n"); - } - - /* buf_block_t */ - block = chunk->blocks; - for (i = chunk->size; i--; ) { - buf_block_reuse(block, logi_offset); - block++; - } - - if (logi_offset || blocks_offset) { - fprintf(stderr, - "InnoDB: Aligning logical offset..."); - - - /* buf_pool_t buf_pool_backup */ - UT_LIST_OFFSET(flush_list, buf_page_t, shm_info->buf_pool_backup.flush_list, - previous_frame_address, logi_offset, blocks_offset); - UT_LIST_OFFSET(free, buf_page_t, shm_info->buf_pool_backup.free, - previous_frame_address, logi_offset, blocks_offset); - UT_LIST_OFFSET(LRU, buf_page_t, shm_info->buf_pool_backup.LRU, - previous_frame_address, logi_offset, blocks_offset); - if (shm_info->buf_pool_backup.LRU_old) - shm_info->buf_pool_backup.LRU_old = - (buf_page_t*)((byte*)(shm_info->buf_pool_backup.LRU_old) - + (((void*)shm_info->buf_pool_backup.LRU_old > previous_frame_address) - ? logi_offset : blocks_offset)); - - UT_LIST_OFFSET(unzip_LRU, buf_block_t, shm_info->buf_pool_backup.unzip_LRU, - previous_frame_address, logi_offset, blocks_offset); - - UT_LIST_OFFSET(zip_list, buf_page_t, shm_info->buf_pool_backup.zip_clean, - previous_frame_address, logi_offset, blocks_offset); - for (i = 0; i < BUF_BUDDY_SIZES_MAX; i++) { - UT_LIST_OFFSET(zip_list, buf_page_t, shm_info->buf_pool_backup.zip_free[i], - previous_frame_address, logi_offset, blocks_offset); - } - - HASH_OFFSET(zip_hash_tmp, buf_page_t, hash, - previous_frame_address, logi_offset, blocks_offset); - - fprintf(stderr, - " Done.\n"); - } - } else { /* Init block structs and assign frames for them. Then we assign the frames to the first blocks (we already mapped the memory above). 
*/ @@ -1166,11 +843,6 @@ init_again: block++; frame += UNIV_PAGE_SIZE; } - } - - if (shm_info) { - shm_info->frame_offset = chunk->blocks[0].frame - (byte*)chunk->mem; - } return(chunk); } @@ -1287,76 +959,6 @@ buf_chunk_not_freed( return(NULL); } -/*********************************************************************//** -Checks that all blocks in the buffer chunk are in BUF_BLOCK_NOT_USED state. -@return TRUE if all freed */ -static -ibool -buf_chunk_all_free( -/*===============*/ - const buf_chunk_t* chunk) /*!< in: chunk being checked */ -{ - const buf_block_t* block; - ulint i; - - ut_ad(buf_pool); - ut_ad(buf_pool_mutex_own()); /* but we need all mutex here */ - - block = chunk->blocks; - - for (i = chunk->size; i--; block++) { - - if (buf_block_get_state(block) != BUF_BLOCK_NOT_USED) { - - return(FALSE); - } - } - - return(TRUE); -} - -/********************************************************************//** -Frees a chunk of buffer frames. */ -static -void -buf_chunk_free( -/*===========*/ - buf_chunk_t* chunk) /*!< out: chunk of buffers */ -{ - buf_block_t* block; - const buf_block_t* block_end; - - ut_ad(buf_pool_mutex_own()); /* but we need all mutex here */ - - block_end = chunk->blocks + chunk->size; - - for (block = chunk->blocks; block < block_end; block++) { - ut_a(buf_block_get_state(block) == BUF_BLOCK_NOT_USED); - ut_a(!block->page.zip.data); - - ut_ad(!block->page.in_LRU_list); - ut_ad(!block->in_unzip_LRU_list); - ut_ad(!block->page.in_flush_list); - /* Remove the block from the free list. */ - mutex_enter(&free_list_mutex); - ut_ad(block->page.in_free_list); - UT_LIST_REMOVE(free, buf_pool->free, (&block->page)); - mutex_exit(&free_list_mutex); - - /* Free the latches. 
*/ - mutex_free(&block->mutex); - rw_lock_free(&block->lock); -#ifdef UNIV_SYNC_DEBUG - rw_lock_free(&block->debug_latch); -#endif /* UNIV_SYNC_DEBUG */ - UNIV_MEM_UNDESC(block); - } - - ut_a(!srv_buffer_pool_shm_key); - - os_mem_free_large(chunk->mem, chunk->mem_size); -} - /********************************************************************//** Creates the buffer pool. @return own: buf_pool object, NULL if not enough memory or error */ @@ -1403,10 +1005,7 @@ buf_pool_init(void) srv_buf_pool_curr_size = buf_pool->curr_size * UNIV_PAGE_SIZE; buf_pool->page_hash = hash_create(2 * buf_pool->curr_size); - /* zip_hash is allocated to shm when srv_buffer_pool_shm_key is enabled */ - if (!srv_buffer_pool_shm_key) { buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size); - } buf_pool->last_printout_time = time(NULL); @@ -1421,86 +1020,6 @@ buf_pool_init(void) --------------------------- */ /* All fields are initialized by mem_zalloc(). */ - if (srv_buffer_pool_shm_key) { - buf_shm_info_t* shm_info; - - ut_a((byte*)chunk->blocks == (byte*)chunk->mem + sizeof(buf_shm_info_t)); - shm_info = chunk->mem; - - buf_pool->zip_hash = (hash_table_t*)((byte*)chunk->mem + shm_info->zip_hash_offset); - - if(shm_info->is_new) { - shm_info->is_new = FALSE; /* initialization was finished */ - } else { - buf_block_t* block = chunk->blocks; - buf_page_t* b; - - /* shm_info->buf_pool_backup should be converted */ - /* at buf_chunk_init(). So copy simply. 
*/ - buf_pool->flush_list = shm_info->buf_pool_backup.flush_list; - buf_pool->freed_page_clock = shm_info->buf_pool_backup.freed_page_clock; - buf_pool->free = shm_info->buf_pool_backup.free; - buf_pool->LRU = shm_info->buf_pool_backup.LRU; - buf_pool->LRU_old = shm_info->buf_pool_backup.LRU_old; - buf_pool->LRU_old_len = shm_info->buf_pool_backup.LRU_old_len; - buf_pool->unzip_LRU = shm_info->buf_pool_backup.unzip_LRU; - buf_pool->zip_clean = shm_info->buf_pool_backup.zip_clean; - for (i = 0; i < BUF_BUDDY_SIZES_MAX; i++) { - buf_pool->zip_free[i] = shm_info->buf_pool_backup.zip_free[i]; - } - - for (i = 0; i < chunk->size; i++, block++) { - if (buf_block_get_state(block) - == BUF_BLOCK_FILE_PAGE) { - ut_d(block->page.in_page_hash = TRUE); - HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, - buf_page_address_fold( - block->page.space, - block->page.offset), - &block->page); - } - } - - for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; - b = UT_LIST_GET_NEXT(zip_list, b)) { - ut_ad(!b->in_flush_list); - ut_ad(b->in_LRU_list); - - ut_d(b->in_page_hash = TRUE); - HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, - buf_page_address_fold(b->space, b->offset), b); - } - - for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; - b = UT_LIST_GET_NEXT(flush_list, b)) { - ut_ad(b->in_flush_list); - ut_ad(b->in_LRU_list); - - switch (buf_page_get_state(b)) { - case BUF_BLOCK_ZIP_DIRTY: - ut_d(b->in_page_hash = TRUE); - HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, - buf_page_address_fold(b->space, - b->offset), b); - break; - case BUF_BLOCK_FILE_PAGE: - /* uncompressed page */ - break; - case BUF_BLOCK_ZIP_FREE: - case BUF_BLOCK_ZIP_PAGE: - case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: - case BUF_BLOCK_MEMORY: - case BUF_BLOCK_REMOVE_HASH: - ut_error; - break; - } - } - - - } - } - mutex_exit(&LRU_list_mutex); rw_lock_x_unlock(&page_hash_latch); buf_pool_mutex_exit(); @@ -1525,49 +1044,16 @@ buf_pool_free(void) buf_chunk_t* chunk; buf_chunk_t* chunks; - if 
(srv_buffer_pool_shm_key) { - buf_shm_info_t* shm_info; - - ut_a(buf_pool->n_chunks == 1); - - chunk = buf_pool->chunks; - shm_info = chunk->mem; - ut_a((byte*)chunk->blocks == (byte*)chunk->mem + sizeof(buf_shm_info_t)); - - /* validation the shared memory segment doesn't have unrecoverable contents. */ - /* Currently, validation became not needed */ - shm_info->reusable = TRUE; - - memcpy(&(shm_info->buf_pool_backup), buf_pool, sizeof(buf_pool_t)); - memcpy(&(shm_info->chunk_backup), chunk, sizeof(buf_chunk_t)); - - if (srv_fast_shutdown < 2) { - if (srv_buffer_pool_shm_checksum) { - shm_info->checksum = ut_fold_binary_32((byte*)chunk->mem + sizeof(buf_shm_info_t), - chunk->mem_size - sizeof(buf_shm_info_t)); - } else { - shm_info->checksum = BUF_NO_CHECKSUM_MAGIC; - } - shm_info->clean = TRUE; - } - - os_shm_free(chunk->mem, chunk->mem_size); - } else { chunks = buf_pool->chunks; chunk = chunks + buf_pool->n_chunks; while (--chunk >= chunks) { - /* Bypass the checks of buf_chunk_free(), since they - would fail at shutdown. */ os_mem_free_large(chunk->mem, chunk->mem_size); } - } mem_free(buf_pool->chunks); hash_table_free(buf_pool->page_hash); - if (!srv_buffer_pool_shm_key) { hash_table_free(buf_pool->zip_hash); - } mem_free(buf_pool); buf_pool = NULL; } @@ -1741,335 +1227,6 @@ buf_relocate( } /********************************************************************//** -Shrinks the buffer pool. 
*/ -static -void -buf_pool_shrink( -/*============*/ - ulint chunk_size) /*!< in: number of pages to remove */ -{ - buf_chunk_t* chunks; - buf_chunk_t* chunk; - ulint max_size; - ulint max_free_size; - buf_chunk_t* max_chunk; - buf_chunk_t* max_free_chunk; - - ut_ad(!buf_pool_mutex_own()); - -try_again: - btr_search_disable(); /* Empty the adaptive hash index again */ - //buf_pool_mutex_enter(); - mutex_enter(&LRU_list_mutex); - - if (srv_buffer_pool_shm_key) { - /* Cannot support shrink */ - goto func_done; - } - -shrink_again: - if (buf_pool->n_chunks <= 1) { - - /* Cannot shrink if there is only one chunk */ - goto func_done; - } - - /* Search for the largest free chunk - not larger than the size difference */ - chunks = buf_pool->chunks; - chunk = chunks + buf_pool->n_chunks; - max_size = max_free_size = 0; - max_chunk = max_free_chunk = NULL; - - while (--chunk >= chunks) { - if (chunk->size <= chunk_size - && chunk->size > max_free_size) { - if (chunk->size > max_size) { - max_size = chunk->size; - max_chunk = chunk; - } - - if (buf_chunk_all_free(chunk)) { - max_free_size = chunk->size; - max_free_chunk = chunk; - } - } - } - - if (!max_free_size) { - - ulint dirty = 0; - ulint nonfree = 0; - buf_block_t* block; - buf_block_t* bend; - - /* Cannot shrink: try again later - (do not assign srv_buf_pool_old_size) */ - if (!max_chunk) { - - goto func_exit; - } - - block = max_chunk->blocks; - bend = block + max_chunk->size; - - /* Move the blocks of chunk to the end of the - LRU list and try to flush them. */ - for (; block < bend; block++) { - switch (buf_block_get_state(block)) { - case BUF_BLOCK_NOT_USED: - continue; - case BUF_BLOCK_FILE_PAGE: - break; - default: - nonfree++; - continue; - } - - mutex_enter(&block->mutex); - /* The following calls will temporarily - release block->mutex and buf_pool_mutex. - Therefore, we have to always retry, - even if !dirty && !nonfree. 
*/ - - if (!buf_flush_ready_for_replace(&block->page)) { - - buf_LRU_make_block_old(&block->page); - dirty++; - } else if (buf_LRU_free_block(&block->page, TRUE, FALSE) - != BUF_LRU_FREED) { - nonfree++; - } - - mutex_exit(&block->mutex); - } - - //buf_pool_mutex_exit(); - mutex_exit(&LRU_list_mutex); - - /* Request for a flush of the chunk if it helps. - Do not flush if there are non-free blocks, since - flushing will not make the chunk freeable. */ - if (nonfree) { - /* Avoid busy-waiting. */ - os_thread_sleep(100000); - } else if (dirty - && buf_flush_batch(BUF_FLUSH_LRU, dirty, 0) - == ULINT_UNDEFINED) { - - buf_flush_wait_batch_end(BUF_FLUSH_LRU); - } - - goto try_again; - } - - max_size = max_free_size; - max_chunk = max_free_chunk; - - srv_buf_pool_old_size = srv_buf_pool_size; - - /* Rewrite buf_pool->chunks. Copy everything but max_chunk. */ - chunks = mem_alloc((buf_pool->n_chunks - 1) * sizeof *chunks); - memcpy(chunks, buf_pool->chunks, - (max_chunk - buf_pool->chunks) * sizeof *chunks); - memcpy(chunks + (max_chunk - buf_pool->chunks), - max_chunk + 1, - buf_pool->chunks + buf_pool->n_chunks - - (max_chunk + 1)); - ut_a(buf_pool->curr_size > max_chunk->size); - buf_pool->curr_size -= max_chunk->size; - srv_buf_pool_curr_size = buf_pool->curr_size * UNIV_PAGE_SIZE; - chunk_size -= max_chunk->size; - buf_chunk_free(max_chunk); - mem_free(buf_pool->chunks); - buf_pool->chunks = chunks; - buf_pool->n_chunks--; - - /* Allow a slack of one megabyte. */ - if (chunk_size > 1048576 / UNIV_PAGE_SIZE) { - - goto shrink_again; - } - -func_done: - srv_buf_pool_old_size = srv_buf_pool_size; -func_exit: - //buf_pool_mutex_exit(); - mutex_exit(&LRU_list_mutex); - btr_search_enable(); -} - -/********************************************************************//** -Rebuild buf_pool->page_hash. 
*/ -static -void -buf_pool_page_hash_rebuild(void) -/*============================*/ -{ - ulint i; - ulint n_chunks; - buf_chunk_t* chunk; - hash_table_t* page_hash; - hash_table_t* zip_hash; - buf_page_t* b; - - //buf_pool_mutex_enter(); - mutex_enter(&LRU_list_mutex); - rw_lock_x_lock(&page_hash_latch); - mutex_enter(&flush_list_mutex); - - - /* Free, create, and populate the hash table. */ - hash_table_free(buf_pool->page_hash); - buf_pool->page_hash = page_hash = hash_create(2 * buf_pool->curr_size); - zip_hash = hash_create(2 * buf_pool->curr_size); - - HASH_MIGRATE(buf_pool->zip_hash, zip_hash, buf_page_t, hash, - BUF_POOL_ZIP_FOLD_BPAGE); - - hash_table_free(buf_pool->zip_hash); - buf_pool->zip_hash = zip_hash; - - /* Insert the uncompressed file pages to buf_pool->page_hash. */ - - chunk = buf_pool->chunks; - n_chunks = buf_pool->n_chunks; - - for (i = 0; i < n_chunks; i++, chunk++) { - ulint j; - buf_block_t* block = chunk->blocks; - - for (j = 0; j < chunk->size; j++, block++) { - if (buf_block_get_state(block) - == BUF_BLOCK_FILE_PAGE) { - ut_ad(!block->page.in_zip_hash); - ut_ad(block->page.in_page_hash); - - HASH_INSERT(buf_page_t, hash, page_hash, - buf_page_address_fold( - block->page.space, - block->page.offset), - &block->page); - } - } - } - - /* Insert the compressed-only pages to buf_pool->page_hash. - All such blocks are either in buf_pool->zip_clean or - in buf_pool->flush_list. 
*/ - - for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; - b = UT_LIST_GET_NEXT(zip_list, b)) { - ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); - ut_ad(!b->in_flush_list); - ut_ad(b->in_LRU_list); - ut_ad(b->in_page_hash); - ut_ad(!b->in_zip_hash); - - HASH_INSERT(buf_page_t, hash, page_hash, - buf_page_address_fold(b->space, b->offset), b); - } - - for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; - b = UT_LIST_GET_NEXT(flush_list, b)) { - ut_ad(b->in_flush_list); - ut_ad(b->in_LRU_list); - ut_ad(b->in_page_hash); - ut_ad(!b->in_zip_hash); - - switch (buf_page_get_state(b)) { - case BUF_BLOCK_ZIP_DIRTY: - HASH_INSERT(buf_page_t, hash, page_hash, - buf_page_address_fold(b->space, - b->offset), b); - break; - case BUF_BLOCK_FILE_PAGE: - /* uncompressed page */ - break; - case BUF_BLOCK_ZIP_FREE: - case BUF_BLOCK_ZIP_PAGE: - case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: - case BUF_BLOCK_MEMORY: - case BUF_BLOCK_REMOVE_HASH: - ut_error; - break; - } - } - - //buf_pool_mutex_exit(); - mutex_exit(&LRU_list_mutex); - rw_lock_x_unlock(&page_hash_latch); - mutex_exit(&flush_list_mutex); -} - -/********************************************************************//** -Resizes the buffer pool. */ -UNIV_INTERN -void -buf_pool_resize(void) -/*=================*/ -{ - if (srv_buffer_pool_shm_key) { - /* Cannot support resize */ - return; - } - - //buf_pool_mutex_enter(); - mutex_enter(&LRU_list_mutex); - - if (srv_buf_pool_old_size == srv_buf_pool_size) { - - //buf_pool_mutex_exit(); - mutex_exit(&LRU_list_mutex); - return; - } - - if (srv_buf_pool_curr_size + 1048576 > srv_buf_pool_size) { - - //buf_pool_mutex_exit(); - mutex_exit(&LRU_list_mutex); - - /* Disable adaptive hash indexes and empty the index - in order to free up memory in the buffer pool chunks. 
*/ - buf_pool_shrink((srv_buf_pool_curr_size - srv_buf_pool_size) - / UNIV_PAGE_SIZE); - } else if (srv_buf_pool_curr_size + 1048576 < srv_buf_pool_size) { - - /* Enlarge the buffer pool by at least one megabyte */ - - ulint mem_size - = srv_buf_pool_size - srv_buf_pool_curr_size; - buf_chunk_t* chunks; - buf_chunk_t* chunk; - - chunks = mem_alloc((buf_pool->n_chunks + 1) * sizeof *chunks); - - memcpy(chunks, buf_pool->chunks, buf_pool->n_chunks - * sizeof *chunks); - - chunk = &chunks[buf_pool->n_chunks]; - - if (!buf_chunk_init(chunk, mem_size)) { - mem_free(chunks); - } else { - buf_pool->curr_size += chunk->size; - srv_buf_pool_curr_size = buf_pool->curr_size - * UNIV_PAGE_SIZE; - mem_free(buf_pool->chunks); - buf_pool->chunks = chunks; - buf_pool->n_chunks++; - } - - srv_buf_pool_old_size = srv_buf_pool_size; - //buf_pool_mutex_exit(); - mutex_exit(&LRU_list_mutex); - } - - buf_pool_page_hash_rebuild(); -} - -/********************************************************************//** Moves a page to the start of the buffer pool LRU list. This high-level function can be used to prevent an important page from slipping out of the buffer pool. */ @@ -2303,6 +1460,27 @@ lookup: #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ } + if (UNIV_UNLIKELY(bpage->space_was_being_deleted)) { + /* This page is obsoleted, should discard and retry */ + rw_lock_s_unlock(&page_hash_latch); + + mutex_enter(&LRU_list_mutex); + block_mutex = buf_page_get_mutex_enter(bpage); + + if (UNIV_UNLIKELY(!block_mutex)) { + mutex_exit(&LRU_list_mutex); + goto lookup; + } + + buf_LRU_free_block(bpage, TRUE, TRUE); + + mutex_exit(&LRU_list_mutex); + mutex_exit(block_mutex); + block_mutex = NULL; + + goto lookup; + } + if (UNIV_UNLIKELY(!bpage->zip.data)) { /* There is no compressed page. 
*/ err_exit: @@ -2311,13 +1489,12 @@ err_exit: return(NULL); } - if (srv_pass_corrupt_table) { + if (srv_pass_corrupt_table <= 1) { if (bpage->is_corrupt) { rw_lock_s_unlock(&page_hash_latch); return(NULL); } } - ut_a(!(bpage->is_corrupt)); block_mutex = buf_page_get_mutex_enter(bpage); @@ -2340,13 +1517,32 @@ err_exit: case BUF_BLOCK_FILE_PAGE: ut_a(block_mutex == &((buf_block_t*) bpage)->mutex); - /* Discard the uncompressed page frame if possible. */ - if (buf_LRU_free_block(bpage, FALSE, FALSE) == BUF_LRU_FREED) { + /* release mutex to obey to latch-order */ + mutex_exit(block_mutex); + + /* get LRU_list_mutex for buf_LRU_free_block() */ + mutex_enter(&LRU_list_mutex); + mutex_enter(block_mutex); + if (UNIV_UNLIKELY(bpage->space != space + || bpage->offset != offset + || !bpage->in_LRU_list + || !bpage->zip.data)) { + /* someone should interrupt, retry */ + mutex_exit(&LRU_list_mutex); + mutex_exit(block_mutex); + goto lookup; + } + + /* Discard the uncompressed page frame if possible. 
*/ + if (buf_LRU_free_block(bpage, FALSE, TRUE)) { + mutex_exit(&LRU_list_mutex); mutex_exit(block_mutex); goto lookup; } + mutex_exit(&LRU_list_mutex); + buf_block_buf_fix_inc((buf_block_t*) bpage, __FILE__, __LINE__); goto got_block; @@ -2516,16 +1712,19 @@ buf_block_align( /* TODO: protect buf_pool->chunks with a mutex (it will currently remain constant after buf_pool_init()) */ for (chunk = buf_pool->chunks, i = buf_pool->n_chunks; i--; chunk++) { - lint offs = ptr - chunk->blocks->frame; + ulint offs; - if (UNIV_UNLIKELY(offs < 0)) { + if (UNIV_UNLIKELY(ptr < chunk->blocks->frame)) { continue; } + /* else */ + + offs = ptr - chunk->blocks->frame; offs >>= UNIV_PAGE_SIZE_SHIFT; - if (UNIV_LIKELY((ulint) offs < chunk->size)) { + if (UNIV_LIKELY(offs < chunk->size)) { buf_block_t* block = &chunk->blocks[offs]; /* The function buf_chunk_init() invokes @@ -2651,7 +1850,7 @@ buf_page_get_gen( ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ buf_block_t* guess, /*!< in: guessed block or NULL */ ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL, - BUF_GET_NO_LATCH */ + BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH */ const char* file, /*!< in: file name */ ulint line, /*!< in: line where called */ mtr_t* mtr) /*!< in: mini-transaction */ @@ -2661,7 +1860,7 @@ buf_page_get_gen( ulint fix_type; ibool must_read; ulint retries = 0; - mutex_t* block_mutex= 0; + mutex_t* block_mutex = NULL; trx_t* trx = NULL; ulint sec; ulint ms; @@ -2673,9 +1872,19 @@ buf_page_get_gen( ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH) || (rw_latch == RW_NO_LATCH)); - ut_ad((mode != BUF_GET_NO_LATCH) || (rw_latch == RW_NO_LATCH)); - ut_ad((mode == BUF_GET) || (mode == BUF_GET_IF_IN_POOL) - || (mode == BUF_GET_NO_LATCH)); +#ifdef UNIV_DEBUG + switch (mode) { + case BUF_GET_NO_LATCH: + ut_ad(rw_latch == RW_NO_LATCH); + break; + case BUF_GET: + case BUF_GET_IF_IN_POOL: + case BUF_PEEK_IF_IN_POOL: + break; + default: + ut_error; + } +#endif /* UNIV_DEBUG */ ut_ad(zip_size == 
fil_space_get_zip_size(space)); ut_ad(ut_is_2pow(zip_size)); #ifndef UNIV_LOG_DEBUG @@ -2693,13 +1902,8 @@ loop: block_mutex = buf_page_get_mutex_enter((buf_page_t*)block); /* If the guess is a compressed page descriptor that - has been allocated by buf_buddy_alloc(), it may have - been invalidated by buf_buddy_relocate(). In that - case, block could point to something that happens to - contain the expected bits in block->page. Similarly, - the guess may be pointing to a buffer pool chunk that - has been released when resizing the buffer pool. */ - + has been allocated by buf_page_alloc_descriptor(), + it may have been freed by buf_relocate(). */ if (!block_mutex) { block = guess = NULL; } else if (!buf_block_is_uncompressed(block) @@ -2720,6 +1924,27 @@ loop: rw_lock_s_lock(&page_hash_latch); block = (buf_block_t*) buf_page_hash_get(space, offset); if (block) { + if (UNIV_UNLIKELY(block->page.space_was_being_deleted)) { + /* This page is obsoleted, should discard and retry */ + rw_lock_s_unlock(&page_hash_latch); + + mutex_enter(&LRU_list_mutex); + block_mutex = buf_page_get_mutex_enter((buf_page_t*)block); + + if (UNIV_UNLIKELY(!block_mutex)) { + mutex_exit(&LRU_list_mutex); + goto loop; + } + + buf_LRU_free_block((buf_page_t*)block, TRUE, TRUE); + + mutex_exit(&LRU_list_mutex); + mutex_exit(block_mutex); + block_mutex = NULL; + + goto loop; + } + block_mutex = buf_page_get_mutex_enter((buf_page_t*)block); ut_a(block_mutex); } @@ -2732,7 +1957,8 @@ loop2: //buf_pool_mutex_exit(); - if (mode == BUF_GET_IF_IN_POOL) { + if (mode == BUF_GET_IF_IN_POOL + || mode == BUF_PEEK_IF_IN_POOL) { return(NULL); } @@ -2771,7 +1997,8 @@ loop2: must_read = buf_block_get_io_fix(block) == BUF_IO_READ; - if (must_read && mode == BUF_GET_IF_IN_POOL) { + if (must_read && (mode == BUF_GET_IF_IN_POOL + || mode == BUF_PEEK_IF_IN_POOL)) { /* The page is only being read to buffer */ //buf_pool_mutex_exit(); mutex_exit(block_mutex); @@ -2779,13 +2006,12 @@ loop2: return(NULL); } - if 
(srv_pass_corrupt_table) { + if (srv_pass_corrupt_table <= 1) { if (block->page.is_corrupt) { mutex_exit(block_mutex); return(NULL); } } - ut_a(!(block->page.is_corrupt)); switch (buf_block_get_state(block)) { buf_page_t* bpage; @@ -2897,8 +2123,10 @@ wait_until_unfixed: if (buf_page_get_state(&block->page) == BUF_BLOCK_ZIP_PAGE) { +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, &block->page); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ ut_ad(!block->page.in_flush_list); } else { /* Relocate buf_pool->flush_list. */ @@ -2931,10 +2159,10 @@ wait_until_unfixed: buf_pool->n_pend_unzip++; mutex_exit(&buf_pool_mutex); - buf_buddy_free(bpage, sizeof *bpage, FALSE); - //buf_pool_mutex_exit(); + buf_page_free_descriptor(bpage); + /* Decompress the page and apply buffered operations while not holding buf_pool_mutex or block->mutex. */ success = buf_zip_decompress(block, srv_use_checksums); @@ -2981,9 +2209,9 @@ wait_until_unfixed: /* Try to evict the block from the buffer pool, to use the insert buffer as much as possible. 
*/ - if (buf_LRU_free_block(&block->page, TRUE, FALSE) == BUF_LRU_FREED) { - buf_pool_mutex_exit(); - mutex_exit(&block->mutex); + if (buf_LRU_free_block(&block->page, TRUE, FALSE)) { + //buf_pool_mutex_exit(); + mutex_exit(block_mutex); fprintf(stderr, "innodb_change_buffering_debug evict %u %u\n", (unsigned) space, (unsigned) offset); @@ -3011,7 +2239,9 @@ wait_until_unfixed: //buf_pool_mutex_exit(); mutex_exit(block_mutex); - buf_page_set_accessed_make_young(&block->page, access_time); + if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL)) { + buf_page_set_accessed_make_young(&block->page, access_time); + } #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG ut_a(!block->page.file_page_was_freed); @@ -3077,7 +2307,7 @@ wait_until_unfixed: mtr_memo_push(mtr, block, fix_type); - if (!access_time) { + if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL) && !access_time) { /* In the case of a first access, try to apply linear read-ahead */ @@ -3531,6 +2761,7 @@ buf_page_init_for_read( { buf_block_t* block; buf_page_t* bpage; + buf_page_t* bpage_in_bp; mtr_t mtr; ibool lru = FALSE; void* data; @@ -3566,11 +2797,29 @@ buf_page_init_for_read( ut_ad(block); } +retry: //buf_pool_mutex_enter(); mutex_enter(&LRU_list_mutex); rw_lock_x_lock(&page_hash_latch); - if (buf_page_hash_get(space, offset)) { + bpage_in_bp = buf_page_hash_get(space, offset); + + if (UNIV_UNLIKELY(bpage_in_bp && bpage_in_bp->space_was_being_deleted)) { + mutex_t* block_mutex = buf_page_get_mutex_enter(bpage_in_bp); + + /* This page is obsoleted, should discard and retry */ + rw_lock_x_unlock(&page_hash_latch); + ut_a(block_mutex); + + buf_LRU_free_block(bpage_in_bp, TRUE, TRUE); + + mutex_exit(&LRU_list_mutex); + mutex_exit(block_mutex); + + goto retry; + } + + if (bpage_in_bp) { /* The page is already in the buffer pool. 
*/ err_exit: if (block) { @@ -3648,17 +2897,12 @@ err_exit: mutex_exit(&LRU_list_mutex); mutex_exit(&block->mutex); } else { - /* Defer buf_buddy_alloc() until after the block has - been found not to exist. The buf_buddy_alloc() and - buf_buddy_free() calls may be expensive because of - buf_buddy_relocate(). */ /* The compressed page must be allocated before the control block (bpage), in order to avoid the invocation of buf_buddy_relocate_block() on uninitialized data. */ data = buf_buddy_alloc(zip_size, &lru, TRUE); - bpage = buf_buddy_alloc(sizeof *bpage, &lru, TRUE); /* If buf_buddy_alloc() allocated storage from the LRU list, it released and reacquired buf_pool_mutex. Thus, we must @@ -3666,17 +2910,16 @@ err_exit: if (UNIV_UNLIKELY(lru) && UNIV_LIKELY_NULL(buf_page_hash_get(space, offset))) { - /* The block was added by some other thread. */ - buf_buddy_free(bpage, sizeof *bpage, TRUE); buf_buddy_free(data, zip_size, TRUE); mutex_exit(&LRU_list_mutex); rw_lock_x_unlock(&page_hash_latch); - bpage = NULL; goto func_exit; } + bpage = buf_page_alloc_descriptor(); + page_zip_des_init(&bpage->zip); page_zip_set_size(&bpage->zip, zip_size); bpage->zip.data = data; @@ -3706,9 +2949,11 @@ err_exit: /* The block must be put to the LRU list, to the old blocks */ buf_LRU_add_block(bpage, TRUE/* to old blocks */); +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG mutex_enter(&flush_list_mutex); buf_LRU_insert_zip_clean(bpage); mutex_exit(&flush_list_mutex); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ mutex_exit(&LRU_list_mutex); @@ -3759,12 +3004,28 @@ buf_page_create( free_block = buf_LRU_get_free_block(); +retry: //buf_pool_mutex_enter(); mutex_enter(&LRU_list_mutex); rw_lock_x_lock(&page_hash_latch); block = (buf_block_t*) buf_page_hash_get(space, offset); + if (UNIV_UNLIKELY(block && block->page.space_was_being_deleted)) { + mutex_t* block_mutex = buf_page_get_mutex_enter((buf_page_t*)block); + + /* This page is obsoleted, should discard and retry */ + 
rw_lock_x_unlock(&page_hash_latch); + ut_a(block_mutex); + + buf_LRU_free_block((buf_page_t*)block, TRUE, TRUE); + + mutex_exit(&LRU_list_mutex); + mutex_exit(block_mutex); + + goto retry; + } + if (block && buf_page_in_file(&block->page)) { #ifdef UNIV_IBUF_COUNT_DEBUG ut_a(ibuf_count_get(space, offset) == 0); @@ -3889,8 +3150,7 @@ UNIV_INTERN void buf_page_io_complete( /*=================*/ - buf_page_t* bpage, /*!< in: pointer to the block in question */ - trx_t* trx) + buf_page_t* bpage) /*!< in: pointer to the block in question */ { enum buf_io_fix io_type; const ibool uncompressed = (buf_page_get_state(bpage) @@ -4011,14 +3271,18 @@ corrupt: if (srv_pass_corrupt_table && !trx_sys_sys_space(bpage->space) && bpage->space < SRV_LOG_SPACE_FIRST_ID) { + trx_t* trx; + fprintf(stderr, "InnoDB: space %u will be treated as corrupt.\n", bpage->space); fil_space_set_corrupt(bpage->space); - if (trx && trx->dict_operation_lock_mode == 0) { - dict_table_set_corrupt_by_space(bpage->space, TRUE); - } else { + + trx = innobase_get_trx(); + if (trx && trx->dict_operation_lock_mode == RW_X_LATCH) { dict_table_set_corrupt_by_space(bpage->space, FALSE); + } else { + dict_table_set_corrupt_by_space(bpage->space, TRUE); } bpage->is_corrupt = TRUE; } else @@ -4781,12 +4045,16 @@ buf_print_io( /* Statistics about read ahead algorithm */ fprintf(file, "Pages read ahead %.2f/s," - " evicted without access %.2f/s\n", + " evicted without access %.2f/s," + " Random read ahead %.2f/s\n", (buf_pool->stat.n_ra_pages_read - buf_pool->old_stat.n_ra_pages_read) / time_elapsed, (buf_pool->stat.n_ra_pages_evicted - buf_pool->old_stat.n_ra_pages_evicted) + / time_elapsed, + (buf_pool->stat.n_ra_pages_read_rnd + - buf_pool->old_stat.n_ra_pages_read_rnd) / time_elapsed); /* Print some values to help us with visualizing what is diff --git a/storage/xtradb/buf/buf0flu.c b/storage/xtradb/buf/buf0flu.c index cda8d3b170e..0ea3ed29d2b 100644 --- a/storage/xtradb/buf/buf0flu.c +++ 
b/storage/xtradb/buf/buf0flu.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -459,7 +459,9 @@ buf_flush_remove( case BUF_BLOCK_ZIP_DIRTY: buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE); UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage); +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG buf_LRU_insert_zip_clean(bpage); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ break; case BUF_BLOCK_FILE_PAGE: UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage); @@ -769,7 +771,7 @@ corrupted_page: flush: /* Now flush the doublewrite buffer data to disk */ - fil_flush(srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE); + fil_flush(srv_doublewrite_file ? 
TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE, FALSE); /* We know that the writes have been flushed to disk now and in recovery we will find them in the doublewrite buffer @@ -1084,7 +1086,7 @@ buf_flush_page_try( /*===============*/ buf_block_t* block) /*!< in/out: buffer control block */ { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(mutex_own(&block->mutex)); @@ -1092,8 +1094,11 @@ buf_flush_page_try( return(FALSE); } + buf_pool_mutex_enter(); + if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0 || buf_pool->init_flush[BUF_FLUSH_LRU]) { + buf_pool_mutex_exit(); /* There is already a flush batch of the same type running */ return(FALSE); } diff --git a/storage/xtradb/buf/buf0lru.c b/storage/xtradb/buf/buf0lru.c index 583eec9bd9c..df74ccf500f 100644 --- a/storage/xtradb/buf/buf0lru.c +++ b/storage/xtradb/buf/buf0lru.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -259,18 +259,20 @@ buf_LRU_drop_page_hash_for_tablespace( * BUF_LRU_DROP_SEARCH_HASH_SIZE); //buf_pool_mutex_enter(); mutex_enter(&LRU_list_mutex); + num_entries = 0; scan_again: - num_entries = 0; bpage = UT_LIST_GET_LAST(buf_pool->LRU); while (bpage != NULL) { + /* bpage->state,space,io_fix,buf_fix_count are protected by block_mutex at XtraDB */ mutex_t* block_mutex = buf_page_get_mutex_enter(bpage); buf_page_t* prev_bpage; + ibool is_fixed; prev_bpage = UT_LIST_GET_PREV(LRU, bpage); - if (!block_mutex) { + if (UNIV_UNLIKELY(!block_mutex)) { goto next_page; } @@ -278,59 +280,77 @@ scan_again: if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE || bpage->space != id - || bpage->buf_fix_count > 0 || bpage->io_fix != BUF_IO_NONE) { - /* We leave the fixed pages as is in this scan. - To be dealt with later in the final scan. */ + /* Compressed pages are never hashed. + Skip blocks of other tablespaces. + Skip I/O-fixed blocks (to be dealt with later). */ mutex_exit(block_mutex); - goto next_page; +next_page: + bpage = prev_bpage; + continue; } - if (((buf_block_t*) bpage)->is_hashed) { + //mutex_enter(&((buf_block_t*) bpage)->mutex); + is_fixed = bpage->buf_fix_count > 0 + || !((buf_block_t*) bpage)->is_hashed; + //mutex_exit(&((buf_block_t*) bpage)->mutex); - /* Store the offset(i.e.: page_no) in the array - so that we can drop hash index in a batch - later. */ - page_arr[num_entries] = bpage->offset; + if (is_fixed) { mutex_exit(block_mutex); - ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE); - ++num_entries; + goto next_page; + } - if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) { - goto next_page; - } - /* Array full. We release the buf_pool_mutex to - obey the latching order. 
*/ - //buf_pool_mutex_exit(); - mutex_exit(&LRU_list_mutex); + /* Store the page number so that we can drop the hash + index in a batch later. */ + page_arr[num_entries] = bpage->offset; + mutex_exit(block_mutex); - buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, - num_entries); - num_entries = 0; - //buf_pool_mutex_enter(); - mutex_enter(&LRU_list_mutex); - } else { - mutex_exit(block_mutex); + ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE); + ++num_entries; + + if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) { + goto next_page; } -next_page: - /* Note that we may have released the buf_pool mutex - above after reading the prev_bpage during processing - of a page_hash_batch (i.e.: when the array was full). - This means that prev_bpage can change in LRU list. - This is OK because this function is a 'best effort' - to drop as many search hash entries as possible and - it does not guarantee that ALL such entries will be - dropped. */ - bpage = prev_bpage; + /* Array full. We release the buf_pool_mutex to + obey the latching order. */ + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, + num_entries); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + num_entries = 0; + + /* Note that we released the buf_pool mutex above + after reading the prev_bpage during processing of a + page_hash_batch (i.e.: when the array was full). + Because prev_bpage could belong to a compressed-only + block, it may have been relocated, and thus the + pointer cannot be trusted. Because bpage is of type + buf_block_t, it is safe to dereference. + + bpage can change in the LRU list. This is OK because + this function is a 'best effort' to drop as many + search hash entries as possible and it does not + guarantee that ALL such entries will be dropped. */ /* If, however, bpage has been removed from LRU list to the free list then we should restart the scan. bpage->state is protected by buf_pool mutex. 
*/ - if (bpage && !buf_page_in_file(bpage)) { - ut_a(num_entries == 0); + + /* obtain block_mutex again to avoid race condition of bpage->state */ + block_mutex = buf_page_get_mutex_enter(bpage); + if (!block_mutex) { goto scan_again; } + + if (bpage + && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { + mutex_exit(block_mutex); + goto scan_again; + } + mutex_exit(block_mutex); } //buf_pool_mutex_exit(); @@ -372,7 +392,7 @@ scan_again: while (bpage != NULL) { buf_page_t* prev_bpage; - ibool prev_bpage_buf_fix = FALSE; + mutex_t* block_mutex = NULL; ut_a(buf_page_in_file(bpage)); @@ -385,26 +405,28 @@ scan_again: if (buf_page_get_space(bpage) != id) { /* Skip this block, as it does not belong to the space that is being invalidated. */ + goto next_page; } else if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) { /* We cannot remove this page during this scan yet; maybe the system is currently reading it in, or flushing the modifications to the file */ all_freed = FALSE; + goto next_page; } else { - mutex_t* block_mutex = buf_page_get_mutex_enter(bpage); + block_mutex = buf_page_get_mutex_enter(bpage); if (!block_mutex) { /* It may be impossible case... Something wrong, so will be scan_again */ all_freed = FALSE; - - goto next_page_no_mutex; + goto next_page; } if (bpage->buf_fix_count > 0) { + mutex_exit(block_mutex); /* We cannot remove this page during this scan yet; maybe the system is currently reading it in, or flushing @@ -414,108 +436,61 @@ scan_again: goto next_page; } + } + + ut_ad(mutex_own(block_mutex)); #ifdef UNIV_DEBUG - if (buf_debug_prints) { - fprintf(stderr, - "Dropping space %lu page %lu\n", - (ulong) buf_page_get_space(bpage), - (ulong) buf_page_get_page_no(bpage)); - } + if (buf_debug_prints) { + fprintf(stderr, + "Dropping space %lu page %lu\n", + (ulong) buf_page_get_space(bpage), + (ulong) buf_page_get_page_no(bpage)); + } #endif - if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { - /* This is a compressed-only block - descriptor. 
Ensure that prev_bpage - cannot be relocated when bpage is freed. */ - if (UNIV_LIKELY(prev_bpage != NULL)) { - switch (buf_page_get_state( - prev_bpage)) { - case BUF_BLOCK_FILE_PAGE: - /* Descriptors of uncompressed - blocks will not be relocated, - because we are holding the - buf_pool_mutex. */ - break; - case BUF_BLOCK_ZIP_PAGE: - case BUF_BLOCK_ZIP_DIRTY: - /* Descriptors of compressed- - only blocks can be relocated, - unless they are buffer-fixed. - Because both bpage and - prev_bpage are protected by - buf_pool_zip_mutex, it is - not necessary to acquire - further mutexes. */ - ut_ad(&buf_pool_zip_mutex - == block_mutex); - ut_ad(mutex_own(block_mutex)); - prev_bpage_buf_fix = TRUE; - prev_bpage->buf_fix_count++; - break; - default: - ut_error; - } - } - } else if (((buf_block_t*) bpage)->is_hashed) { - ulint page_no; - ulint zip_size; - - //buf_pool_mutex_exit(); - mutex_exit(&LRU_list_mutex); - rw_lock_x_unlock(&page_hash_latch); + if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { + /* This is a compressed-only block + descriptor. Do nothing. */ + } else if (((buf_block_t*) bpage)->is_hashed) { + ulint page_no; + ulint zip_size; - zip_size = buf_page_get_zip_size(bpage); - page_no = buf_page_get_page_no(bpage); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); - mutex_exit(block_mutex); + zip_size = buf_page_get_zip_size(bpage); + page_no = buf_page_get_page_no(bpage); - /* Note that the following call will acquire - an S-latch on the page */ + mutex_exit(block_mutex); - btr_search_drop_page_hash_when_freed( - id, zip_size, page_no); - goto scan_again; - } + /* Note that the following call will acquire + an S-latch on the page */ - if (bpage->oldest_modification != 0) { + btr_search_drop_page_hash_when_freed( + id, zip_size, page_no); + goto scan_again; + } - buf_flush_remove(bpage); - } + if (bpage->oldest_modification != 0) { - /* Remove from the LRU list. 
*/ + buf_flush_remove(bpage); + } - if (buf_LRU_block_remove_hashed_page(bpage, TRUE) - != BUF_BLOCK_ZIP_FREE) { - buf_LRU_block_free_hashed_page((buf_block_t*) - bpage, TRUE); - } else { - /* The block_mutex should have been - released by buf_LRU_block_remove_hashed_page() - when it returns BUF_BLOCK_ZIP_FREE. */ - ut_ad(block_mutex == &buf_pool_zip_mutex); - ut_ad(!mutex_own(block_mutex)); - - if (prev_bpage_buf_fix) { - /* We temporarily buffer-fixed - prev_bpage, so that - buf_buddy_free() could not - relocate it, in case it was a - compressed-only block - descriptor. */ - - mutex_enter(block_mutex); - ut_ad(prev_bpage->buf_fix_count > 0); - prev_bpage->buf_fix_count--; - mutex_exit(block_mutex); - } + /* Remove from the LRU list. */ - goto next_page_no_mutex; - } -next_page: + if (buf_LRU_block_remove_hashed_page(bpage, TRUE) + != BUF_BLOCK_ZIP_FREE) { + buf_LRU_block_free_hashed_page((buf_block_t*) bpage, TRUE); mutex_exit(block_mutex); + } else { + /* The block_mutex should have been released + by buf_LRU_block_remove_hashed_page() when it + returns BUF_BLOCK_ZIP_FREE. 
*/ + ut_ad(block_mutex == &buf_pool_zip_mutex); + ut_ad(!mutex_own(block_mutex)); } - -next_page_no_mutex: +next_page: bpage = prev_bpage; } @@ -539,6 +514,8 @@ buf_LRU_mark_space_was_deleted( ulint id) /*!< in: space id */ { buf_page_t* bpage; + buf_chunk_t* chunk; + ulint i, j; mutex_enter(&LRU_list_mutex); @@ -552,8 +529,32 @@ buf_LRU_mark_space_was_deleted( } mutex_exit(&LRU_list_mutex); + + rw_lock_s_lock(&btr_search_latch); + chunk = buf_pool->chunks; + for (i = buf_pool->n_chunks; i--; chunk++) { + buf_block_t* block = chunk->blocks; + for (j = chunk->size; j--; block++) { + if (buf_block_get_state(block) + != BUF_BLOCK_FILE_PAGE + || !block->is_hashed + || buf_page_get_space(&block->page) != id) { + continue; + } + + rw_lock_s_unlock(&btr_search_latch); + + rw_lock_x_lock(&block->lock); + btr_search_drop_page_hash_index(block); + rw_lock_x_unlock(&block->lock); + + rw_lock_s_lock(&btr_search_latch); + } + } + rw_lock_s_unlock(&btr_search_latch); } +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /********************************************************************//** Insert a compressed block into buf_pool->zip_clean in the LRU order. 
*/ UNIV_INTERN @@ -587,6 +588,7 @@ buf_LRU_insert_zip_clean( UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_clean, bpage); } } +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ /******************************************************************//** Try to free an uncompressed page of a compressed block from the unzip @@ -629,7 +631,7 @@ restart: UNIV_LIKELY(block != NULL) && UNIV_LIKELY(distance > 0); block = UT_LIST_GET_PREV(unzip_LRU, block), distance--) { - enum buf_lru_free_block_status freed; + ibool freed; mutex_enter(&block->mutex); if (!block->in_unzip_LRU_list || !block->page.in_LRU_list @@ -645,24 +647,9 @@ restart: freed = buf_LRU_free_block(&block->page, FALSE, have_LRU_mutex); mutex_exit(&block->mutex); - switch (freed) { - case BUF_LRU_FREED: + if (freed) { return(TRUE); - - case BUF_LRU_CANNOT_RELOCATE: - /* If we failed to relocate, try - regular LRU eviction. */ - return(FALSE); - - case BUF_LRU_NOT_FREED: - /* The block was buffer-fixed or I/O-fixed. - Keep looking. */ - continue; } - - /* inappropriate return value from - buf_LRU_free_block() */ - ut_error; } return(FALSE); @@ -695,10 +682,9 @@ restart: UNIV_LIKELY(bpage != NULL) && UNIV_LIKELY(distance > 0); bpage = UT_LIST_GET_PREV(LRU, bpage), distance--) { - enum buf_lru_free_block_status freed; - unsigned accessed; - mutex_t* block_mutex - = buf_page_get_mutex_enter(bpage); + ibool freed; + unsigned accessed; + mutex_t* block_mutex = buf_page_get_mutex_enter(bpage); if (!block_mutex) { goto restart; @@ -717,8 +703,7 @@ restart: freed = buf_LRU_free_block(bpage, TRUE, have_LRU_mutex); mutex_exit(block_mutex); - switch (freed) { - case BUF_LRU_FREED: + if (freed) { /* Keep track of pages that are evicted without ever being accessed. This gives us a measure of the effectiveness of readahead */ @@ -726,21 +711,7 @@ restart: ++buf_pool->stat.n_ra_pages_evicted; } return(TRUE); - - case BUF_LRU_NOT_FREED: - /* The block was dirty, buffer-fixed, or I/O-fixed. - Keep looking. 
*/ - continue; - - case BUF_LRU_CANNOT_RELOCATE: - /* This should never occur, because we - want to discard the compressed page too. */ - break; } - - /* inappropriate return value from - buf_LRU_free_block() */ - ut_error; } return(FALSE); @@ -1457,17 +1428,16 @@ buf_LRU_make_block_old( Try to free a block. If bpage is a descriptor of a compressed-only page, the descriptor object will be freed as well. -NOTE: If this function returns BUF_LRU_FREED, it will temporarily +NOTE: If this function returns TRUE, it will temporarily release buf_pool_mutex. Furthermore, the page frame will no longer be accessible via bpage. The caller must hold buf_pool_mutex and buf_page_get_mutex(bpage) and release these two mutexes after the call. No other buf_page_get_mutex() may be held when calling this function. -@return BUF_LRU_FREED if freed, BUF_LRU_CANNOT_RELOCATE or -BUF_LRU_NOT_FREED otherwise. */ +@return TRUE if freed, FALSE otherwise. */ UNIV_INTERN -enum buf_lru_free_block_status +ibool buf_LRU_free_block( /*===============*/ buf_page_t* bpage, /*!< in: block to be freed */ @@ -1493,7 +1463,7 @@ buf_LRU_free_block( if (!bpage->in_LRU_list || !block_mutex || !buf_page_can_relocate(bpage)) { /* Do not free buffer-fixed or I/O-fixed blocks. */ - return(BUF_LRU_NOT_FREED); + return(FALSE); } if (bpage->space_was_being_deleted && bpage->oldest_modification != 0) { @@ -1509,7 +1479,7 @@ buf_LRU_free_block( /* Do not completely free dirty blocks. */ if (bpage->oldest_modification) { - return(BUF_LRU_NOT_FREED); + return(FALSE); } } else if (bpage->oldest_modification) { /* Do not completely free dirty blocks. */ @@ -1517,7 +1487,7 @@ buf_LRU_free_block( if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY); - return(BUF_LRU_NOT_FREED); + return(FALSE); } goto alloc; @@ -1526,14 +1496,8 @@ buf_LRU_free_block( If it cannot be allocated (without freeing a block from the LRU list), refuse to free bpage. 
*/ alloc: - //buf_pool_mutex_exit_forbid(); - b = buf_buddy_alloc(sizeof *b, NULL, FALSE); - //buf_pool_mutex_exit_allow(); - - if (UNIV_UNLIKELY(!b)) { - return(BUF_LRU_CANNOT_RELOCATE); - } - + b = buf_page_alloc_descriptor(); + ut_a(b); //memcpy(b, bpage, sizeof *b); } @@ -1563,7 +1527,7 @@ not_freed: if (!have_LRU_mutex) mutex_exit(&LRU_list_mutex); rw_lock_x_unlock(&page_hash_latch); - return(BUF_LRU_NOT_FREED); + return(FALSE); } else if (zip || !bpage->zip.data) { if (bpage->oldest_modification) goto not_freed; @@ -1670,7 +1634,9 @@ not_freed: mutex_enter(&flush_list_mutex); if (b->state == BUF_BLOCK_ZIP_PAGE) { +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG buf_LRU_insert_zip_clean(b); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ } else { /* Relocate on buf_pool->flush_list. */ buf_flush_relocate_on_flush_list(bpage, b); @@ -1746,7 +1712,7 @@ not_freed: rw_lock_x_unlock(&page_hash_latch); } - return(BUF_LRU_FREED); + return(TRUE); } /******************************************************************//** @@ -1967,15 +1933,16 @@ buf_LRU_block_remove_hashed_page( ut_a(bpage->zip.data); ut_a(buf_page_get_zip_size(bpage)); +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, bpage); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ mutex_exit(&buf_pool_zip_mutex); //buf_pool_mutex_exit_forbid(); buf_buddy_free(bpage->zip.data, page_zip_get_size(&bpage->zip), TRUE); - buf_buddy_free(bpage, sizeof(*bpage), TRUE); //buf_pool_mutex_exit_allow(); - UNIV_MEM_UNDESC(bpage); + buf_page_free_descriptor(bpage); return(BUF_BLOCK_ZIP_FREE); case BUF_BLOCK_FILE_PAGE: @@ -2284,6 +2251,11 @@ buf_LRU_file_restore(void) " InnoDB: cannot open %s\n", LRU_DUMP_FILE); goto end; } + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Restoring buffer pool pages from %s\n", + LRU_DUMP_FILE); + if (size == 0 || size_high > 0 || size % 8) { fprintf(stderr, " InnoDB: broken LRU dump file\n"); goto end; @@ -2385,7 +2357,7 @@ 
buf_LRU_file_restore(void) ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: reading pages based on the dumped LRU list was done." + " InnoDB: Completed reading buffer pool pages" " (requested: %lu, read: %lu)\n", req, reads); ret = TRUE; end: diff --git a/storage/xtradb/buf/buf0rea.c b/storage/xtradb/buf/buf0rea.c index 59de70d9a8a..966a9b5d7a6 100644 --- a/storage/xtradb/buf/buf0rea.c +++ b/storage/xtradb/buf/buf0rea.c @@ -38,6 +38,14 @@ Created 11/5/1995 Heikki Tuuri #include "srv0start.h" #include "srv0srv.h" +/** The size in blocks of the area where the random read-ahead algorithm counts +the accessed pages when deciding whether to read-ahead */ +#define BUF_READ_AHEAD_RANDOM_AREA BUF_READ_AHEAD_AREA + +/** There must be at least this many pages in buf_pool in the area to start +a random read-ahead */ +#define BUF_READ_AHEAD_RANDOM_THRESHOLD (5 + BUF_READ_AHEAD_RANDOM_AREA / 8) + /** The linear read-ahead area size */ #define BUF_READ_AHEAD_LINEAR_AREA BUF_READ_AHEAD_AREA @@ -201,13 +209,179 @@ not_to_recover: if (sync) { /* The i/o is already completed when we arrive from fil_read */ - buf_page_io_complete(bpage, trx); + buf_page_io_complete(bpage); } return(1); } /********************************************************************//** +Applies a random read-ahead in buf_pool if there are at least a threshold +value of accessed pages from the random read-ahead area. Does not read any +page, not even the one at the position (space, offset), if the read-ahead +mechanism is not activated. NOTE 1: the calling thread may own latches on +pages: to avoid deadlocks this function must be written such that it cannot +end up waiting for these latches! NOTE 2: the calling thread must want +access to the page given: this rule is set to prevent unintended read-aheads +performed by ibuf routines, a situation which could result in a deadlock if +the OS does not support asynchronous i/o. 
+@return number of page read requests issued; NOTE that if we read ibuf +pages, it may happen that the page at the given page number does not +get read even if we return a positive value! */ +static +ulint +buf_read_ahead_random( +/*==================*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes, or 0 */ + ulint offset, /*!< in: page number of a page which the current thread + wants to access */ + trx_t* trx) +{ + ib_int64_t tablespace_version; + ulint recent_blocks = 0; + ulint count; + ulint ibuf_mode; + ulint low, high; + ulint err; + ulint i; + ulint buf_read_ahead_random_area; + + if (!srv_random_read_ahead) { + /* Disabled by user */ + return(0); + } + + if (srv_startup_is_before_trx_rollback_phase) { + /* No read-ahead to avoid thread deadlocks */ + return(0); + } + + if (ibuf_bitmap_page(zip_size, offset) + || trx_sys_hdr_page(space, offset)) { + + /* If it is an ibuf bitmap page or trx sys hdr, we do + no read-ahead, as that could break the ibuf page access + order */ + + return(0); + } + + /* Remember the tablespace version before we ask the tablespace size + below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we + do not try to read outside the bounds of the tablespace! 
*/ + + tablespace_version = fil_space_get_version(space); + + buf_read_ahead_random_area = BUF_READ_AHEAD_RANDOM_AREA; + + low = (offset / buf_read_ahead_random_area) + * buf_read_ahead_random_area; + high = (offset / buf_read_ahead_random_area + 1) + * buf_read_ahead_random_area; + if (high > fil_space_get_size(space)) { + + high = fil_space_get_size(space); + } + + //buf_pool_mutex_enter(); + mutex_enter(&buf_pool_mutex); + + if (buf_pool->n_pend_reads + > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); + + return(0); + } + mutex_exit(&buf_pool_mutex); + + /* Count how many blocks in the area have been recently accessed, + that is, reside near the start of the LRU list. */ + + rw_lock_s_lock(&page_hash_latch); + for (i = low; i < high; i++) { + const buf_page_t* bpage = buf_page_hash_get(space, i); + + if (bpage + && buf_page_is_accessed(bpage) + && buf_page_peek_if_young(bpage)) { + + recent_blocks++; + + if (recent_blocks >= BUF_READ_AHEAD_RANDOM_THRESHOLD) { + + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); + goto read_ahead; + } + } + } + + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); + /* Do nothing */ + return(0); + +read_ahead: + /* Read all the suitable blocks within the area */ + + if (ibuf_inside()) { + ibuf_mode = BUF_READ_IBUF_PAGES_ONLY; + } else { + ibuf_mode = BUF_READ_ANY_PAGE; + } + + count = 0; + + for (i = low; i < high; i++) { + /* It is only sensible to do read-ahead in the non-sync aio + mode: hence FALSE as the first parameter */ + + if (!ibuf_bitmap_page(zip_size, i)) { + count += buf_read_page_low( + &err, FALSE, + ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER, + space, zip_size, FALSE, + tablespace_version, i, trx); + if (err == DB_TABLESPACE_DELETED) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: in random" + " readahead trying to access\n" + "InnoDB: tablespace %lu page %lu,\n" + "InnoDB: but the tablespace does not" + " 
exist or is just being dropped.\n", + (ulong) space, (ulong) i); + } + } + } + + /* In simulated aio we wake the aio handler threads only after + queuing all aio requests, in native aio the following call does + nothing: */ + + os_aio_simulated_wake_handler_threads(); + +#ifdef UNIV_DEBUG + if (buf_debug_prints && (count > 0)) { + fprintf(stderr, + "Random read-ahead space %lu offset %lu pages %lu\n", + (ulong) space, (ulong) offset, + (ulong) count); + } +#endif /* UNIV_DEBUG */ + + /* Read ahead is considered one I/O operation for the purpose of + LRU policy decision. */ + buf_LRU_stat_inc_io(); + + buf_pool->stat.n_ra_pages_read_rnd += count; + return(count); +} + + +/********************************************************************//** High-level function which reads a page asynchronously from a file to the buffer buf_pool if it is not already there. Sets the io_fix flag and sets an exclusive lock on the buffer frame. The flag is cleared and the x-lock @@ -226,6 +400,9 @@ buf_read_page( ulint count; ulint err; + count = buf_read_ahead_random(space, zip_size, offset, trx); + srv_buf_pool_reads += count; + tablespace_version = fil_space_get_version(space); /* We do the i/o in the synchronous aio mode to save thread diff --git a/storage/xtradb/dict/dict0dict.c b/storage/xtradb/dict/dict0dict.c index 18880a5c72c..a8a1d7a11e6 100644 --- a/storage/xtradb/dict/dict0dict.c +++ b/storage/xtradb/dict/dict0dict.c @@ -4524,6 +4524,8 @@ dict_store_statistics( break; } + btr_pcur_store_position(&pcur, &mtr); + if (rec_get_deleted_flag(rec, 0)) { /* don't count */ i--; @@ -4564,6 +4566,10 @@ dict_store_statistics( rests--; next_rec: + mtr_commit(&mtr); + mtr_start(&mtr); + btr_pcur_restore_position(BTR_MODIFY_LEAF, &pcur, &mtr); + btr_pcur_move_to_next_user_rec(&pcur, &mtr); } btr_pcur_close(&pcur); @@ -4654,6 +4660,7 @@ dict_update_statistics( do { if (table->is_corrupt) { ut_a(srv_pass_corrupt_table); + dict_table_stats_unlock(table, RW_X_LATCH); return; } diff --git 
a/storage/xtradb/dict/dict0load.c b/storage/xtradb/dict/dict0load.c index edd77e2530f..64d7fad3557 100644 --- a/storage/xtradb/dict/dict0load.c +++ b/storage/xtradb/dict/dict0load.c @@ -554,9 +554,10 @@ dict_load_columns( } /********************************************************************//** -Loads definitions for index fields. */ +Loads definitions for index fields. +@return DB_SUCCESS if ok, DB_CORRUPTION if failed */ static -void +ulint dict_load_fields( /*=============*/ dict_index_t* index, /*!< in: index whose fields to load */ @@ -575,6 +576,7 @@ dict_load_fields( byte* buf; ulint i; mtr_t mtr; + ulint error = DB_SUCCESS; ut_ad(mutex_own(&(dict_sys->mutex))); @@ -641,6 +643,26 @@ dict_load_fields( field = rec_get_nth_field_old(rec, 4, &len); + if (prefix_len >= DICT_MAX_INDEX_COL_LEN) { + fprintf(stderr, "InnoDB: Error: load index" + " '%s' failed.\n" + "InnoDB: index field '%s' has a prefix" + " length of %lu bytes,\n" + "InnoDB: which exceeds the" + " maximum limit of %lu bytes.\n" + "InnoDB: Please use server that" + " supports long index prefix\n" + "InnoDB: or turn on" + " innodb_force_recovery to load" + " the table\n", + index->name, mem_heap_strdupl( + heap, (char*) field, len), + (ulong) prefix_len, + (ulong) (DICT_MAX_INDEX_COL_LEN - 1)); + error = DB_CORRUPTION; + goto func_exit; + } + dict_mem_index_add_field(index, mem_heap_strdupl(heap, (char*) field, len), @@ -650,8 +672,10 @@ next_rec: btr_pcur_move_to_next_user_rec(&pcur, &mtr); } +func_exit: btr_pcur_close(&pcur); mtr_commit(&mtr); + return(error); } /********************************************************************//** @@ -802,7 +826,25 @@ dict_load_indexes( space, type, n_fields); index->id = id; - dict_load_fields(index, heap); + error = dict_load_fields(index, heap); + + if (error != DB_SUCCESS) { + fprintf(stderr, "InnoDB: Error: load index '%s'" + " for table '%s' failed\n", + index->name, table->name); + + /* If the force recovery flag is set, and + if the failed index is not 
the primary index, we + will continue and open other indexes */ + if (srv_force_recovery + && !(index->type & DICT_CLUSTERED)) { + error = DB_SUCCESS; + goto next_rec; + } else { + goto func_exit; + } + } + error = dict_index_add_to_cache(table, index, page_no, FALSE); /* The data dictionary tables should never contain @@ -1028,9 +1070,18 @@ err_exit: } else { table->fk_max_recusive_level = 0; } - } else if (!srv_force_recovery) { - dict_table_remove_from_cache(table); - table = NULL; + } else { + dict_index_t* index; + + /* Make sure that at least the clustered index was loaded. + Otherwise refuse to load the table */ + index = dict_table_get_first_index(table); + + if (!srv_force_recovery || !index + || !(index->type & DICT_CLUSTERED)) { + dict_table_remove_from_cache(table); + table = NULL; + } } #if 0 if (err != DB_SUCCESS && table != NULL) { diff --git a/storage/xtradb/dict/dict0mem.c b/storage/xtradb/dict/dict0mem.c index f2d219bfd4f..c3da053c2de 100644 --- a/storage/xtradb/dict/dict0mem.c +++ b/storage/xtradb/dict/dict0mem.c @@ -36,6 +36,9 @@ Created 1/8/1996 Heikki Tuuri #ifndef UNIV_HOTBACKUP # include "lock0lock.h" #endif /* !UNIV_HOTBACKUP */ +#ifdef UNIV_BLOB_DEBUG +# include "ut0rbt.h" +#endif /* UNIV_BLOB_DEBUG */ #define DICT_HEAP_SIZE 100 /*!< initial memory heap size when creating a table or index object */ @@ -318,6 +321,12 @@ dict_mem_index_free( { ut_ad(index); ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); +#ifdef UNIV_BLOB_DEBUG + if (index->blobs) { + mutex_free(&index->blobs_mutex); + rbt_free(index->blobs); + } +#endif /* UNIV_BLOB_DEBUG */ mem_heap_free(index->heap); } diff --git a/storage/xtradb/fil/fil0fil.c b/storage/xtradb/fil/fil0fil.c index 1cb8d565119..1ef0a9a46fb 100644 --- a/storage/xtradb/fil/fil0fil.c +++ b/storage/xtradb/fil/fil0fil.c @@ -46,6 +46,8 @@ Created 10/25/1995 Heikki Tuuri #include "row0mysql.h" #include "row0row.h" #include "que0que.h" +#include "btr0btr.h" +#include "btr0sea.h" #ifndef UNIV_HOTBACKUP # include 
"buf0lru.h" # include "ibuf0ibuf.h" @@ -817,7 +819,7 @@ fil_node_close_file( ut_ad(node && system); ut_ad(mutex_own(&(system->mutex))); ut_a(node->open); - ut_a(node->n_pending == 0 || srv_lazy_drop_table); + ut_a(node->n_pending == 0 || node->space->is_being_deleted); ut_a(node->n_pending_flushes == 0); ut_a(node->modification_counter == node->flush_counter); @@ -830,7 +832,7 @@ fil_node_close_file( ut_a(system->n_open > 0); system->n_open--; - if (node->space->purpose == FIL_TABLESPACE && !trx_sys_sys_space(node->space->id)) { + if (node->n_pending == 0 && node->space->purpose == FIL_TABLESPACE && !trx_sys_sys_space(node->space->id)) { ut_a(UT_LIST_GET_LEN(system->LRU) > 0); /* The node is in the LRU list, remove it */ @@ -1029,7 +1031,7 @@ fil_node_free( ut_ad(node && system && space); ut_ad(mutex_own(&(system->mutex))); ut_a(node->magic_n == FIL_NODE_MAGIC_N); - ut_a(node->n_pending == 0 || srv_lazy_drop_table); + ut_a(node->n_pending == 0 || space->is_being_deleted); if (node->open) { /* We fool the assertion in fil_node_close_file() to think @@ -2579,7 +2581,7 @@ retry: os_thread_sleep(20000); - fil_flush(id); + fil_flush(id, TRUE); goto retry; @@ -2792,7 +2794,7 @@ error_exit2: goto error_exit; } - ret = os_file_flush(file); + ret = os_file_flush(file, TRUE); if (!ret) { fputs("InnoDB: Error: file flush of tablespace ", stderr); @@ -2977,7 +2979,7 @@ fil_reset_too_high_lsns( } } - success = os_file_flush(file); + success = os_file_flush(file, TRUE); if (!success) { goto func_exit; @@ -2999,7 +3001,7 @@ fil_reset_too_high_lsns( goto func_exit; } - success = os_file_flush(file); + success = os_file_flush(file, TRUE); func_exit: os_file_close(file); ut_free(buf2); @@ -3009,6 +3011,97 @@ func_exit: } /********************************************************************//** +Checks if a page is corrupt. 
(for offline page) +*/ +static +ibool +fil_page_buf_page_is_corrupted_offline( +/*===================================*/ + const byte* page, /*!< in: a database page */ + ulint zip_size) /*!< in: size of compressed page; + 0 for uncompressed pages */ +{ + ulint checksum_field; + ulint old_checksum_field; + + if (!zip_size + && memcmp(page + FIL_PAGE_LSN + 4, + page + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) { + return(TRUE); + } + + checksum_field = mach_read_from_4(page + + FIL_PAGE_SPACE_OR_CHKSUM); + + if (zip_size) { + return(checksum_field != BUF_NO_CHECKSUM_MAGIC + && checksum_field + != page_zip_calc_checksum(page, zip_size)); + } + + old_checksum_field = mach_read_from_4( + page + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM); + + if (old_checksum_field != mach_read_from_4(page + + FIL_PAGE_LSN) + && old_checksum_field != BUF_NO_CHECKSUM_MAGIC + && old_checksum_field + != buf_calc_page_old_checksum(page)) { + return(TRUE); + } + + if (!srv_fast_checksum + && checksum_field != 0 + && checksum_field != BUF_NO_CHECKSUM_MAGIC + && checksum_field + != buf_calc_page_new_checksum(page)) { + return(TRUE); + } + + if (srv_fast_checksum + && checksum_field != 0 + && checksum_field != BUF_NO_CHECKSUM_MAGIC + && checksum_field + != buf_calc_page_new_checksum_32(page) + && checksum_field + != buf_calc_page_new_checksum(page)) { + return(TRUE); + } + + return(FALSE); +} + +/********************************************************************//** +*/ +static +void +fil_page_buf_page_store_checksum( +/*=============================*/ + byte* page, + ulint zip_size) +{ + if (!zip_size) { + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + srv_use_checksums + ? (!srv_fast_checksum + ? buf_calc_page_new_checksum(page) + : buf_calc_page_new_checksum_32(page)) + : BUF_NO_CHECKSUM_MAGIC); + mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, + srv_use_checksums + ? 
buf_calc_page_old_checksum(page) + : BUF_NO_CHECKSUM_MAGIC); + } else { + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + srv_use_checksums + ? page_zip_calc_checksum(page, zip_size) + : BUF_NO_CHECKSUM_MAGIC); + } +} + +/********************************************************************//** Tries to open a single-table tablespace and optionally checks the space id is right in it. If does not succeed, prints an error message to the .err log. This function is used to open a tablespace when we start up mysqld, and also in @@ -3123,6 +3216,7 @@ fil_open_single_table_tablespace( fil_system_t* system; fil_node_t* node = NULL; fil_space_t* space; + ulint zip_size; buf3 = ut_malloc(2 * UNIV_PAGE_SIZE); descr_page = ut_align(buf3, UNIV_PAGE_SIZE); @@ -3140,12 +3234,15 @@ fil_open_single_table_tablespace( /* store as first descr page */ memcpy(descr_page, page, UNIV_PAGE_SIZE); + zip_size = dict_table_flags_to_zip_size(flags); + ut_a(zip_size == dict_table_flags_to_zip_size(space_flags)); + /* get free limit (page number) of the table space */ /* these should be same to the definition in fsp0fsp.c */ #define FSP_HEADER_OFFSET FIL_PAGE_DATA #define FSP_FREE_LIMIT 12 free_limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + page); - free_limit_bytes = (ib_int64_t)free_limit * (ib_int64_t)UNIV_PAGE_SIZE; + free_limit_bytes = (ib_int64_t)free_limit * (ib_int64_t)(zip_size ? zip_size : UNIV_PAGE_SIZE); /* overwrite fsp header */ fsp_header_init_fields(page, id, flags); @@ -3154,16 +3251,9 @@ fil_open_single_table_tablespace( space_flags = flags; if (mach_read_ull(page + FIL_PAGE_FILE_FLUSH_LSN) > current_lsn) mach_write_ull(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn); - mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, - srv_use_checksums - ? (!srv_fast_checksum - ? buf_calc_page_new_checksum(page) - : buf_calc_page_new_checksum_32(page)) - : BUF_NO_CHECKSUM_MAGIC); - mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, - srv_use_checksums - ? 
buf_calc_page_old_checksum(page) - : BUF_NO_CHECKSUM_MAGIC); + + fil_page_buf_page_store_checksum(page, zip_size); + success = os_file_write(filepath, file, page, 0, 0, UNIV_PAGE_SIZE); /* get file size */ @@ -3173,7 +3263,7 @@ fil_open_single_table_tablespace( if (size_bytes < free_limit_bytes) { free_limit_bytes = size_bytes; - if (size_bytes >= (ib_int64_t) (FSP_EXTENT_SIZE * UNIV_PAGE_SIZE)) { + if (size_bytes >= (ib_int64_t) (FSP_EXTENT_SIZE * (zip_size ? zip_size : UNIV_PAGE_SIZE))) { fprintf(stderr, "InnoDB: free limit of %s is larger than its real size.\n", filepath); file_is_corrupt = TRUE; } @@ -3237,75 +3327,41 @@ skip_info: size_bytes = ut_2pow_round(size_bytes, 1024 * 1024); } */ - if (!(flags & DICT_TF_ZSSIZE_MASK)) { + + if (zip_size) { + fprintf(stderr, "InnoDB: Warning: importing compressed table is still EXPERIMENTAL, currently.\n"); + } + + { mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; ulint* offsets = offsets_; ib_int64_t offset; - size = (ulint) (size_bytes / UNIV_PAGE_SIZE); + size = (ulint) (size_bytes / (zip_size ? zip_size : UNIV_PAGE_SIZE)); /* over write space id of all pages */ rec_offs_init(offsets_); fprintf(stderr, "InnoDB: Progress in %%:"); - for (offset = 0; offset < free_limit_bytes; offset += UNIV_PAGE_SIZE) { - ulint checksum_field; - ulint old_checksum_field; + for (offset = 0; offset < free_limit_bytes; + offset += zip_size ? zip_size : UNIV_PAGE_SIZE) { ibool page_is_corrupt; success = os_file_read(file, page, (ulint)(offset & 0xFFFFFFFFUL), - (ulint)(offset >> 32), UNIV_PAGE_SIZE); + (ulint)(offset >> 32), + zip_size ? 
zip_size : UNIV_PAGE_SIZE); page_is_corrupt = FALSE; /* check consistency */ - if (memcmp(page + FIL_PAGE_LSN + 4, - page + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) { - + if (fil_page_buf_page_is_corrupted_offline(page, zip_size)) { page_is_corrupt = TRUE; } if (mach_read_from_4(page + FIL_PAGE_OFFSET) - != offset / UNIV_PAGE_SIZE) { - - page_is_corrupt = TRUE; - } - - checksum_field = mach_read_from_4(page - + FIL_PAGE_SPACE_OR_CHKSUM); - - old_checksum_field = mach_read_from_4( - page + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN_OLD_CHKSUM); - - if (old_checksum_field != mach_read_from_4(page - + FIL_PAGE_LSN) - && old_checksum_field != BUF_NO_CHECKSUM_MAGIC - && old_checksum_field - != buf_calc_page_old_checksum(page)) { - - page_is_corrupt = TRUE; - } - - if (!srv_fast_checksum - && checksum_field != 0 - && checksum_field != BUF_NO_CHECKSUM_MAGIC - && checksum_field - != buf_calc_page_new_checksum(page)) { - - page_is_corrupt = TRUE; - } - - if (srv_fast_checksum - && checksum_field != 0 - && checksum_field != BUF_NO_CHECKSUM_MAGIC - && checksum_field - != buf_calc_page_new_checksum_32(page) - && checksum_field - != buf_calc_page_new_checksum(page)) { + != offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)) { page_is_corrupt = TRUE; } @@ -3316,7 +3372,8 @@ skip_info: /* it should be overwritten already */ ut_a(!page_is_corrupt); - } else if (!((offset / UNIV_PAGE_SIZE) % UNIV_PAGE_SIZE)) { + } else if (!((offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)) + % (zip_size ? zip_size : UNIV_PAGE_SIZE))) { /* descr page (not header) */ if (page_is_corrupt) { file_is_corrupt = TRUE; @@ -3327,7 +3384,7 @@ skip_info: } /* store as descr page */ - memcpy(descr_page, page, UNIV_PAGE_SIZE); + memcpy(descr_page, page, (zip_size ? 
zip_size : UNIV_PAGE_SIZE)); } else if (descr_is_corrupt) { /* unknown state of the page */ @@ -3355,9 +3412,12 @@ skip_info: ulint bit_index; descr = descr_page + XDES_ARR_OFFSET - + XDES_SIZE * (ut_2pow_remainder((offset / UNIV_PAGE_SIZE), UNIV_PAGE_SIZE) / FSP_EXTENT_SIZE); + + XDES_SIZE * (ut_2pow_remainder( + (offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)), + (zip_size ? zip_size : UNIV_PAGE_SIZE)) / FSP_EXTENT_SIZE); - index = XDES_FREE_BIT + XDES_BITS_PER_PAGE * ((offset / UNIV_PAGE_SIZE) % FSP_EXTENT_SIZE); + index = XDES_FREE_BIT + + XDES_BITS_PER_PAGE * ((offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)) % FSP_EXTENT_SIZE); byte_index = index / 8; bit_index = index % 8; @@ -3375,7 +3435,7 @@ skip_info: } if (page_is_corrupt) { - fprintf(stderr, " [errp:%ld]", (long) (offset / UNIV_PAGE_SIZE)); + fprintf(stderr, " [errp:%ld]", (long) (offset / (zip_size ? zip_size : UNIV_PAGE_SIZE))); /* cannot treat corrupt page */ goto skip_write; @@ -3385,7 +3445,13 @@ skip_info: mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id); for (i = 0; (ulint) i < n_index; i++) { - if ((ulint) (offset / UNIV_PAGE_SIZE) == root_page[i]) { + if ((ulint) (offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)) == root_page[i]) { + if (fil_page_get_type(page) != FIL_PAGE_INDEX) { + file_is_corrupt = TRUE; + fprintf(stderr, " [etyp:%lld]", + offset / (zip_size ? 
zip_size : UNIV_PAGE_SIZE)); + goto skip_write; + } /* this is index root page */ mach_write_to_4(page + FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + FSEG_HDR_SPACE, id); @@ -3398,7 +3464,14 @@ skip_info: if (fil_page_get_type(page) == FIL_PAGE_INDEX) { dulint tmp = mach_read_from_8(page + (PAGE_HEADER + PAGE_INDEX_ID)); - if (mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL) == 0 + for (i = 0; i < n_index; i++) { + if (ut_dulint_cmp(old_id[i], tmp) == 0) { + mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), new_id[i]); + break; + } + } + + if (!zip_size && mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL) == 0 && ut_dulint_cmp(old_id[0], tmp) == 0) { /* leaf page of cluster index, reset trx_id of records */ rec_t* rec; @@ -3417,7 +3490,7 @@ skip_info: ULINT_UNDEFINED, &heap); n_fields = rec_offs_n_fields(offsets); if (!offset) { - offset = row_get_trx_id_offset(rec, index, offsets); + offset = row_get_trx_id_offset(index, offsets); } trx_write_trx_id(rec + offset, ut_dulint_create(0, 1)); @@ -3437,44 +3510,34 @@ skip_info: rec = page_rec_get_next(rec); n_recs--; } - } - - for (i = 0; i < n_index; i++) { - if (ut_dulint_cmp(old_id[i], tmp) == 0) { - mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), new_id[i]); - break; - } + } else if (mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL) == 0 + && ut_dulint_cmp(old_id[0], tmp) != 0) { + mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), ut_dulint_create(0, 1)); } } if (mach_read_ull(page + FIL_PAGE_LSN) > current_lsn) { mach_write_ull(page + FIL_PAGE_LSN, current_lsn); - mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, - current_lsn); + if (!zip_size) { + mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, + current_lsn); + } } - mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, - srv_use_checksums - ? (!srv_fast_checksum - ? 
buf_calc_page_new_checksum(page) - : buf_calc_page_new_checksum_32(page)) - : BUF_NO_CHECKSUM_MAGIC); - mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, - srv_use_checksums - ? buf_calc_page_old_checksum(page) - : BUF_NO_CHECKSUM_MAGIC); + fil_page_buf_page_store_checksum(page, zip_size); success = os_file_write(filepath, file, page, (ulint)(offset & 0xFFFFFFFFUL), - (ulint)(offset >> 32), UNIV_PAGE_SIZE); + (ulint)(offset >> 32), + zip_size ? zip_size : UNIV_PAGE_SIZE); } skip_write: if (free_limit_bytes - && ((ib_int64_t)((offset + UNIV_PAGE_SIZE) * 100) / free_limit_bytes) + && ((ib_int64_t)((offset + (zip_size ? zip_size : UNIV_PAGE_SIZE)) * 100) / free_limit_bytes) != ((offset * 100) / free_limit_bytes)) { fprintf(stderr, " %lu", - (ulong)((ib_int64_t)((offset + UNIV_PAGE_SIZE) * 100) / free_limit_bytes)); + (ulong)((ib_int64_t)((offset + (zip_size ? zip_size : UNIV_PAGE_SIZE)) * 100) / free_limit_bytes)); } } @@ -3530,13 +3593,6 @@ skip_write: if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } - } else { - /* zip page? */ - size = (ulint) - (size_bytes - / dict_table_flags_to_zip_size(flags)); - fprintf(stderr, "InnoDB: import: table %s seems to be in newer format." 
- " It may not be able to treated for now.\n", name); } /* .exp file should be removed */ success = os_file_delete(info_file_path); @@ -3618,6 +3674,271 @@ func_exit: os_file_close(file); mem_free(filepath); + if (srv_expand_import && dict_table_flags_to_zip_size(flags)) { + ulint page_no; + ulint zip_size; + ulint height; + ulint root_height = 0; + rec_t* node_ptr; + dict_table_t* table; + dict_index_t* index; + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + mtr_t mtr; + + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + + rec_offs_init(offsets_); + + zip_size = dict_table_flags_to_zip_size(flags); + + table = dict_table_get_low(name); + index = dict_table_get_first_index(table); + page_no = dict_index_get_page(index); + ut_a(page_no == 3); + + fprintf(stderr, "InnoDB: It is compressed .ibd file. need to convert additionaly on buffer pool.\n"); + + /* down to leaf */ + mtr_start(&mtr); + mtr_set_log_mode(&mtr, MTR_LOG_NONE); + + height = ULINT_UNDEFINED; + + for (;;) { + block = buf_page_get(space_id, zip_size, page_no, + RW_NO_LATCH, &mtr); + page = buf_block_get_frame(block); + + block->check_index_page_at_flush = TRUE; + + if (height == ULINT_UNDEFINED) { + height = btr_page_get_level(page, &mtr); + root_height = height; + } + + if (height == 0) { + break; + } + + node_ptr = page_rec_get_next(page_get_infimum_rec(page)); + + height--; + + offsets = rec_get_offsets(node_ptr, index, offsets, ULINT_UNDEFINED, &heap); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + + mtr_commit(&mtr); + + fprintf(stderr, "InnoDB: pages needs split are ..."); + + /* scan reaf pages */ + while (page_no != FIL_NULL) { + rec_t* rec; + rec_t* supremum; + ulint n_recs; + + mtr_start(&mtr); + + block = buf_page_get(space_id, zip_size, page_no, + RW_X_LATCH, &mtr); + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + if (!page_zip) { + /*something wrong*/ + fprintf(stderr, 
"InnoDB: Something wrong with reading page %lu.\n", page_no); +convert_err_exit: + mtr_commit(&mtr); + mutex_enter(&fil_system->mutex); + fil_space_free(space_id, FALSE); + mutex_exit(&fil_system->mutex); + success = FALSE; + goto convert_exit; + } + + supremum = page_get_supremum_rec(page); + rec = page_rec_get_next(page_get_infimum_rec(page)); + n_recs = page_get_n_recs(page); + + /* illegal operation as InnoDB online system. so not logged */ + while (rec && rec != supremum && n_recs > 0) { + ulint n_fields; + ulint i; + ulint offset = index->trx_id_offset; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + n_fields = rec_offs_n_fields(offsets); + if (!offset) { + offset = row_get_trx_id_offset(index, offsets); + } + trx_write_trx_id(rec + offset, ut_dulint_create(0, 1)); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint local_len; + byte* data; + + data = rec_get_nth_field(rec, offsets, i, &local_len); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + mach_write_to_4(data + local_len + BTR_EXTERN_SPACE_ID, id); + } + } + + rec = page_rec_get_next(rec); + n_recs--; + } + + /* dummy logged update for along with modified page path */ + if (ut_dulint_cmp(index->id, btr_page_get_index_id(page)) != 0) { + /* this should be adjusted already */ + fprintf(stderr, "InnoDB: The page %lu seems to be converted wrong.\n", page_no); + goto convert_err_exit; + } + btr_page_set_index_id(page, page_zip, index->id, &mtr); + + /* confirm whether fits to the page size or not */ + if (!page_zip_compress(page_zip, page, index, &mtr) + && !btr_page_reorganize(block, index, &mtr)) { + buf_block_t* new_block; + page_t* new_page; + page_zip_des_t* new_page_zip; + rec_t* split_rec; + ulint n_uniq; + + /* split page is needed */ + fprintf(stderr, " %lu", page_no); + + mtr_x_lock(dict_index_get_lock(index), &mtr); + + n_uniq = dict_index_get_n_unique_in_tree(index); + + if(page_get_n_recs(page) < 2) { + /* no way to make smaller 
*/ + fprintf(stderr, "InnoDB: The page %lu cannot be store to the page size.\n", page_no); + goto convert_err_exit; + } + + if (UNIV_UNLIKELY(page_no == dict_index_get_page(index))) { + ulint new_page_no; + dtuple_t* node_ptr; + ulint level; + rec_t* node_ptr_rec; + page_cur_t page_cursor; + + /* it is root page, need to raise before split */ + + level = btr_page_get_level(page, &mtr); + + new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, &mtr); + new_page = buf_block_get_frame(new_block); + new_page_zip = buf_block_get_page_zip(new_block); + btr_page_create(new_block, new_page_zip, index, level, &mtr); + + btr_page_set_next(new_page, new_page_zip, FIL_NULL, &mtr); + btr_page_set_prev(new_page, new_page_zip, FIL_NULL, &mtr); + + page_zip_copy_recs(new_page_zip, new_page, + page_zip, page, index, &mtr); + btr_search_move_or_delete_hash_entries(new_block, block, index); + + rec = page_rec_get_next(page_get_infimum_rec(new_page)); + new_page_no = buf_block_get_page_no(new_block); + + node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap, + level); + dtuple_set_info_bits(node_ptr, + dtuple_get_info_bits(node_ptr) + | REC_INFO_MIN_REC_FLAG); + btr_page_empty(block, page_zip, index, level + 1, &mtr); + + btr_page_set_next(page, page_zip, FIL_NULL, &mtr); + btr_page_set_prev(page, page_zip, FIL_NULL, &mtr); + + page_cur_set_before_first(block, &page_cursor); + + node_ptr_rec = page_cur_tuple_insert(&page_cursor, node_ptr, + index, 0, &mtr); + ut_a(node_ptr_rec); + + if (!btr_page_reorganize(block, index, &mtr)) { + fprintf(stderr, "InnoDB: failed to store the page %lu.\n", page_no); + goto convert_err_exit; + } + + /* move to the raised page */ + page_no = new_page_no; + block = new_block; + page = new_page; + page_zip = new_page_zip; + + fprintf(stderr, "(raise_to:%lu)", page_no); + } + + split_rec = page_get_middle_rec(page); + + new_block = btr_page_alloc(index, page_no + 1, FSP_UP, + btr_page_get_level(page, &mtr), &mtr); + new_page = 
buf_block_get_frame(new_block); + new_page_zip = buf_block_get_page_zip(new_block); + btr_page_create(new_block, new_page_zip, index, + btr_page_get_level(page, &mtr), &mtr); + + offsets = rec_get_offsets(split_rec, index, offsets, n_uniq, &heap); + + btr_attach_half_pages(index, block, + split_rec, new_block, FSP_UP, &mtr); + + page_zip_copy_recs(new_page_zip, new_page, + page_zip, page, index, &mtr); + page_delete_rec_list_start(split_rec - page + new_page, + new_block, index, &mtr); + btr_search_move_or_delete_hash_entries(new_block, block, index); + page_delete_rec_list_end(split_rec, block, index, + ULINT_UNDEFINED, ULINT_UNDEFINED, &mtr); + + fprintf(stderr, "(new:%lu)", buf_block_get_page_no(new_block)); + + /* Are they needed? */ + if (!btr_page_reorganize(block, index, &mtr)) { + fprintf(stderr, "InnoDB: failed to store the page %lu.\n", page_no); + goto convert_err_exit; + } + if (!btr_page_reorganize(new_block, index, &mtr)) { + fprintf(stderr, "InnoDB: failed to store the page %lu.\n", buf_block_get_page_no(new_block)); + goto convert_err_exit; + } + } + + page_no = btr_page_get_next(page, &mtr); + + mtr_commit(&mtr); + + if (heap) { + mem_heap_empty(heap); + } + } + + fprintf(stderr, "...done.\nInnoDB: waiting the flush batch of the additional conversion.\n"); + + /* should wait for the not-logged changes are all flushed */ + buf_flush_batch(BUF_FLUSH_LIST, ULINT_MAX, mtr.end_lsn + 1); + buf_flush_wait_batch_end(BUF_FLUSH_LIST); + + fprintf(stderr, "InnoDB: done.\n"); +convert_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + return(success); } #endif /* !UNIV_HOTBACKUP */ @@ -4466,7 +4787,7 @@ fil_extend_space_to_desired_size( mutex_exit(&fil_system->mutex); mutex_exit(&fil_system->file_extend_mutex); - fil_flush(space_id); + fil_flush(space_id, TRUE); return(success); } @@ -4835,7 +5156,7 @@ _fil_io( && ((buf_page_t*)message)->space_was_being_deleted) { if (mode == OS_AIO_NORMAL) { - buf_page_io_complete(message, trx); + 
buf_page_io_complete(message); return(DB_SUCCESS); /*fake*/ } if (type == OS_FILE_READ) { @@ -4946,14 +5267,14 @@ _fil_io( ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0); ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0); - if (srv_pass_corrupt_table && space->is_corrupt) { + if (srv_pass_corrupt_table == 1 && space->is_corrupt) { /* should ignore i/o for the crashed space */ mutex_enter(&fil_system->mutex); fil_node_complete_io(node, fil_system, type); mutex_exit(&fil_system->mutex); if (mode == OS_AIO_NORMAL) { ut_a(space->purpose == FIL_TABLESPACE); - buf_page_io_complete(message, trx); + buf_page_io_complete(message); } if (type == OS_FILE_READ) { return(DB_TABLESPACE_DELETED); @@ -4961,7 +5282,19 @@ _fil_io( return(DB_SUCCESS); } } else { - ut_a(!space->is_corrupt); + if (srv_pass_corrupt_table > 1 && space->is_corrupt) { + /* should ignore write i/o for the crashed space */ + if (type == OS_FILE_WRITE) { + mutex_enter(&fil_system->mutex); + fil_node_complete_io(node, fil_system, type); + mutex_exit(&fil_system->mutex); + if (mode == OS_AIO_NORMAL) { + ut_a(space->purpose == FIL_TABLESPACE); + buf_page_io_complete(message); + } + return(DB_SUCCESS); + } + } #ifdef UNIV_HOTBACKUP /* In ibbackup do normal i/o, not aio */ if (type == OS_FILE_READ) { @@ -5122,7 +5455,7 @@ fil_aio_wait( || buf_page_get_state(message) != BUF_BLOCK_FILE_PAGE); srv_set_io_thread_op_info(segment, "complete io for buf page"); - buf_page_io_complete(message, NULL); + buf_page_io_complete(message); return; } @@ -5146,7 +5479,7 @@ fil_aio_wait( if (fil_node->space->purpose == FIL_TABLESPACE) { srv_set_io_thread_op_info(segment, "complete io for buf page"); - buf_page_io_complete(message, NULL); + buf_page_io_complete(message); } else { srv_set_io_thread_op_info(segment, "complete io for log"); log_io_complete(message); @@ -5161,8 +5494,9 @@ UNIV_INTERN void fil_flush( /*======*/ - ulint space_id) /*!< in: file space id (this can be a group of + ulint space_id, /*!< in: file space id (this can be 
a group of log files or a tablespace of the database) */ + ibool metadata) { fil_space_t* space; fil_node_t* node; @@ -5233,7 +5567,7 @@ retry: /* fprintf(stderr, "Flushing to file %s\n", node->name); */ - os_file_flush(file); + os_file_flush(file, metadata); mutex_enter(&fil_system->mutex); @@ -5316,7 +5650,7 @@ fil_flush_file_spaces( a non-existing space id. */ for (i = 0; i < n_space_ids; i++) { - fil_flush(space_ids[i]); + fil_flush(space_ids[i], TRUE); } mem_free(space_ids); diff --git a/storage/xtradb/ha/hash0hash.c b/storage/xtradb/ha/hash0hash.c index 0f4fc55d895..30c304dafcd 100644 --- a/storage/xtradb/ha/hash0hash.c +++ b/storage/xtradb/ha/hash0hash.c @@ -128,70 +128,6 @@ hash_create( } /*************************************************************//** -*/ -UNIV_INTERN -ulint -hash_create_needed( -/*===============*/ - ulint n) -{ - ulint prime; - ulint offset; - - prime = ut_find_prime(n); - - offset = (sizeof(hash_table_t) + 7) / 8; - offset *= 8; - - return(offset + sizeof(hash_cell_t) * prime); -} - -UNIV_INTERN -void -hash_create_init( -/*=============*/ - hash_table_t* table, - ulint n) -{ - ulint prime; - ulint offset; - - prime = ut_find_prime(n); - - offset = (sizeof(hash_table_t) + 7) / 8; - offset *= 8; - - table->array = (hash_cell_t*)(((byte*)table) + offset); - table->n_cells = prime; -# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG - table->adaptive = FALSE; -# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ - table->n_mutexes = 0; - table->mutexes = NULL; - table->heaps = NULL; - table->heap = NULL; - ut_d(table->magic_n = HASH_TABLE_MAGIC_N); - - /* Initialize the cell array */ - hash_table_clear(table); -} - -UNIV_INTERN -void -hash_create_reuse( -/*==============*/ - hash_table_t* table) -{ - ulint offset; - - offset = (sizeof(hash_table_t) + 7) / 8; - offset *= 8; - - table->array = (hash_cell_t*)(((byte*)table) + offset); - ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); -} - -/*************************************************************//** 
Frees a hash table. */ UNIV_INTERN void diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index e0510663e30..d980bb4e05d 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2000, 2010, MySQL AB & Innobase Oy. All Rights Reserved. +Copyright (c) 2000, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. @@ -196,11 +196,14 @@ static my_bool innobase_create_status_file = FALSE; static my_bool innobase_stats_on_metadata = TRUE; static my_bool innobase_use_sys_stats_table = FALSE; static my_bool innobase_buffer_pool_shm_checksum = TRUE; +static uint innobase_buffer_pool_shm_key = 0; static char* internal_innobase_data_file_path = NULL; static char* innodb_version_str = (char*) INNODB_VERSION_STR; +static my_bool innobase_blocking_lru_restore = FALSE; + /** Possible values for system variable "innodb_stats_method". The values are defined the same as its corresponding MyISAM system variable "myisam_stats_method"(see "myisam_stats_method_names"), for better usability */ @@ -364,6 +367,12 @@ static MYSQL_THDVAR_ULONG(flush_log_at_trx_commit_session, PLUGIN_VAR_RQCMDARG, "The value 3 regards innodb_flush_log_at_trx_commit (default).", NULL, NULL, 3, 0, 3, 0); +static MYSQL_THDVAR_BOOL(fake_changes, PLUGIN_VAR_OPCMDARG, + "In the transaction after enabled, UPDATE, INSERT and DELETE only move the cursor to the records " + "and do nothing other operations (no changes, no ibuf, no undo, no transaction log) in the transaction. " + "This is to cause replication prefetch IO. 
ATTENTION: the transaction started after enabled is affected.", + NULL, NULL, FALSE); + static handler *innobase_create_handler(handlerton *hton, TABLE_SHARE *table, @@ -533,6 +542,8 @@ static SHOW_VAR innodb_status_variables[]= { (char*) &export_vars.innodb_buffer_pool_pages_misc, SHOW_LONG}, {"buffer_pool_pages_total", (char*) &export_vars.innodb_buffer_pool_pages_total, SHOW_LONG}, + {"buffer_pool_read_ahead_rnd", + (char*) &export_vars.innodb_buffer_pool_read_ahead_rnd, SHOW_LONG}, {"buffer_pool_read_ahead", (char*) &export_vars.innodb_buffer_pool_read_ahead, SHOW_LONG}, {"buffer_pool_read_ahead_evicted", @@ -1395,6 +1406,8 @@ innobase_trx_init( trx->check_unique_secondary = !thd_test_options( thd, OPTION_RELAXED_UNIQUE_CHECKS); + trx->fake_changes = THDVAR(thd, fake_changes); + #ifdef EXTENDED_SLOWLOG if (thd_log_slow_verbosity(thd) & SLOG_V_INNODB) { trx->take_stats = TRUE; @@ -2457,6 +2470,12 @@ innobase_change_buffering_inited_ok: srv_buf_pool_size = (ulint) innobase_buffer_pool_size; + if (innobase_buffer_pool_shm_key) { + fprintf(stderr, + "InnoDB: Warning: innodb_buffer_pool_shm_key is deprecated function.\n" + "InnoDB: innodb_buffer_pool_shm_key was ignored.\n"); + } + srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size; srv_n_file_io_threads = (ulint) innobase_file_io_threads; @@ -2473,7 +2492,8 @@ innobase_change_buffering_inited_ok: srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite; srv_use_checksums = (ibool) innobase_use_checksums; srv_fast_checksum = (ibool) innobase_fast_checksum; - srv_buffer_pool_shm_checksum = (ibool) innobase_buffer_pool_shm_checksum; + + srv_blocking_lru_restore = (ibool) innobase_blocking_lru_restore; #ifdef HAVE_LARGE_PAGES if ((os_use_large_pages = (ibool) my_use_large_pages)) @@ -2511,6 +2531,10 @@ innobase_change_buffering_inited_ok: innobase_commit_concurrency_init_default(); +#ifndef EXTENDED_FOR_KILLIDLE + srv_kill_idle_transaction = 0; +#endif + /* Since we in this module access directly the 
fields of a trx struct, and due to different headers and flags it might happen that mutex_t has a different size in this module and in InnoDB @@ -2808,6 +2832,11 @@ innobase_commit( trx_search_latch_release_if_reserved(trx); } + if (trx->fake_changes && (all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)))) { + innobase_rollback(hton, thd, all); /* rollback implicitly */ + thd->main_da.reset_diagnostics_area(); /* because debug assertion code complains, if something left */ + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + } /* The flag trx->active_trans is set to 1 in 1. ::external_lock(), @@ -3798,7 +3827,7 @@ ha_innobase::open( DBUG_RETURN(1); } - if (share->ib_table && share->ib_table->is_corrupt) { + if (srv_pass_corrupt_table <= 1 && share->ib_table && share->ib_table->is_corrupt) { free_share(share); DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE); @@ -3831,18 +3860,18 @@ retry: /* Get pointer to a table object in InnoDB dictionary cache */ ib_table = dict_table_get(norm_name, TRUE); - if (ib_table && ib_table->is_corrupt) { + if (srv_pass_corrupt_table <= 1 && ib_table && ib_table->is_corrupt) { free_share(share); my_free(upd_buff, MYF(0)); DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE); } - if (share->ib_table) { - ut_a(share->ib_table == ib_table); - } else { - share->ib_table = ib_table; - } + share->ib_table = ib_table; + + + + if (NULL == ib_table) { if (is_part && retries < 10) { @@ -4139,25 +4168,6 @@ field_in_record_is_null( return(0); } -/**************************************************************//** -Sets a field in a record to SQL NULL. Uses the record format -information in table to track the null bit in record. 
*/ -static inline -void -set_field_in_record_to_null( -/*========================*/ - TABLE* table, /*!< in: MySQL table object */ - Field* field, /*!< in: MySQL field object */ - char* record) /*!< in: a row in MySQL format */ -{ - int null_offset; - - null_offset = (uint) ((char*) field->null_ptr - - (char*) table->record[0]); - - record[null_offset] = record[null_offset] | field->null_bit; -} - /*************************************************************//** InnoDB uses this function to compare two data fields for which the data type is such that we must use MySQL code to compare them. NOTE that the prototype @@ -5802,7 +5812,7 @@ ha_innobase::index_read( ha_statistic_increment(&SSV::ha_read_key_count); - if (share->ib_table->is_corrupt) { + if (srv_pass_corrupt_table <= 1 && share->ib_table->is_corrupt) { DBUG_RETURN(HA_ERR_CRASHED); } @@ -5871,7 +5881,7 @@ ha_innobase::index_read( ret = DB_UNSUPPORTED; } - if (share->ib_table->is_corrupt) { + if (srv_pass_corrupt_table <= 1 && share->ib_table->is_corrupt) { DBUG_RETURN(HA_ERR_CRASHED); } @@ -5990,7 +6000,7 @@ ha_innobase::change_active_index( { DBUG_ENTER("change_active_index"); - if (share->ib_table->is_corrupt) { + if (srv_pass_corrupt_table <= 1 && share->ib_table->is_corrupt) { DBUG_RETURN(HA_ERR_CRASHED); } @@ -6085,7 +6095,7 @@ ha_innobase::general_fetch( DBUG_ENTER("general_fetch"); - if (share->ib_table->is_corrupt) { + if (srv_pass_corrupt_table <= 1 && share->ib_table->is_corrupt) { DBUG_RETURN(HA_ERR_CRASHED); } @@ -6098,7 +6108,7 @@ ha_innobase::general_fetch( innodb_srv_conc_exit_innodb(prebuilt->trx); - if (share->ib_table->is_corrupt) { + if (srv_pass_corrupt_table <= 1 && share->ib_table->is_corrupt) { DBUG_RETURN(HA_ERR_CRASHED); } @@ -6439,10 +6449,6 @@ create_table_def( DBUG_PRINT("enter", ("table_name: %s", table_name)); ut_a(trx->mysql_thd != NULL); - if (IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(table_name, - (THD*) trx->mysql_thd)) { - DBUG_RETURN(HA_ERR_GENERIC); - } /* MySQL does the 
name length check. But we do additional check on the name length here */ @@ -6562,6 +6568,8 @@ err_col: col_len); } + srv_lower_case_table_names = lower_case_table_names; + error = row_create_table_for_mysql(table, trx); if (error == DB_DUPLICATE_KEY) { @@ -6978,42 +6986,17 @@ ha_innobase::create( DBUG_RETURN(HA_ERR_TO_BIG_ROW); } - /* Get the transaction associated with the current thd, or create one - if not yet created */ - - parent_trx = check_trx_exists(thd); - - /* In case MySQL calls this in the middle of a SELECT query, release - possible adaptive hash latch to avoid deadlocks of threads */ - - trx_search_latch_release_if_reserved(parent_trx); - - trx = innobase_trx_allocate(thd); - - if (lower_case_table_names) { - srv_lower_case_table_names = TRUE; - } else { - srv_lower_case_table_names = FALSE; - } - strcpy(name2, name); normalize_table_name(norm_name, name2); - /* Latch the InnoDB data dictionary exclusively so that no deadlocks - or lock waits can happen in it during a table create operation. - Drop table etc. do this latching in row0mysql.c. */ - - row_mysql_lock_data_dictionary(trx); - /* Create the table definition in InnoDB */ flags = 0; /* Validate create options if innodb_strict_mode is set. */ if (!create_options_are_valid(thd, form, create_info)) { - error = ER_ILLEGAL_HA_CREATE_OPTION; - goto cleanup; + DBUG_RETURN(ER_ILLEGAL_HA_CREATE_OPTION); } if (create_info->key_block_size) { @@ -7155,16 +7138,43 @@ ha_innobase::create( /* Check for name conflicts (with reserved name) for any user indices to be created. 
*/ - if (innobase_index_name_is_reserved(trx, form->key_info, + if (innobase_index_name_is_reserved(thd, form->key_info, form->s->keys)) { - error = -1; - goto cleanup; + DBUG_RETURN(-1); + } + + if (IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(norm_name, thd)) { + DBUG_RETURN(HA_ERR_GENERIC); } if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { flags |= DICT_TF2_TEMPORARY << DICT_TF2_SHIFT; } + /* Get the transaction associated with the current thd, or create one + if not yet created */ + + parent_trx = check_trx_exists(thd); + + /* In case MySQL calls this in the middle of a SELECT query, release + possible adaptive hash latch to avoid deadlocks of threads */ + + trx_search_latch_release_if_reserved(parent_trx); + + trx = innobase_trx_allocate(thd); + + if (trx->fake_changes) { + innobase_commit_low(trx); + trx_free_for_mysql(trx); + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + } + + /* Latch the InnoDB data dictionary exclusively so that no deadlocks + or lock waits can happen in it during a table create operation. + Drop table etc. do this latching in row0mysql.c. */ + + row_mysql_lock_data_dictionary(trx); + error = create_table_def(trx, form, norm_name, create_info->options & HA_LEX_CREATE_TMP_TABLE ? 
name2 : NULL, flags); @@ -7364,6 +7374,10 @@ ha_innobase::delete_all_rows(void) DBUG_RETURN(HA_ERR_CRASHED); } + if (prebuilt->trx->fake_changes) { + goto fallback; + } + /* Truncate the table in InnoDB */ error = row_truncate_table_for_mysql(prebuilt->table, prebuilt->trx); @@ -7424,10 +7438,10 @@ ha_innobase::delete_table( trx = innobase_trx_allocate(thd); - if (lower_case_table_names) { - srv_lower_case_table_names = TRUE; - } else { - srv_lower_case_table_names = FALSE; + if (trx->fake_changes) { + innobase_commit_low(trx); + trx_free_for_mysql(trx); + DBUG_RETURN(HA_ERR_WRONG_COMMAND); } name_len = strlen(name); @@ -7436,6 +7450,8 @@ ha_innobase::delete_table( /* Drop the table in InnoDB */ + srv_lower_case_table_names = lower_case_table_names; + error = row_drop_table_for_mysql(norm_name, trx, thd_sql_command(thd) == SQLCOM_DROP_DB); @@ -7516,6 +7532,12 @@ innobase_drop_database( trx->mysql_thd = NULL; #else trx = innobase_trx_allocate(thd); + if (trx->fake_changes) { + my_free(namebuf, MYF(0)); + innobase_commit_low(trx); + trx_free_for_mysql(trx); + return; /* ignore */ + } #endif row_drop_database_for_mysql(namebuf, trx); my_free(namebuf, MYF(0)); @@ -7552,12 +7574,6 @@ innobase_rename_table( char* norm_from; DBUG_ENTER("innobase_rename_table"); - if (lower_case_table_names) { - srv_lower_case_table_names = TRUE; - } else { - srv_lower_case_table_names = FALSE; - } - // Magic number 64 arbitrary norm_to = (char*) my_malloc(strlen(to) + 64, MYF(0)); norm_from = (char*) my_malloc(strlen(from) + 64, MYF(0)); @@ -7572,6 +7588,8 @@ innobase_rename_table( row_mysql_lock_data_dictionary(trx); } + srv_lower_case_table_names = lower_case_table_names; + error = row_rename_table_for_mysql( norm_from, norm_to, trx, lock_and_commit); @@ -7629,6 +7647,11 @@ ha_innobase::rename_table( trx_search_latch_release_if_reserved(parent_trx); trx = innobase_trx_allocate(thd); + if (trx->fake_changes) { + innobase_commit_low(trx); + trx_free_for_mysql(trx); + 
DBUG_RETURN(HA_ERR_WRONG_COMMAND); + } error = innobase_rename_table(trx, from, to, TRUE); @@ -8132,6 +8155,8 @@ ha_innobase::info_low( if (flag & HA_STATUS_VARIABLE) { + ulint page_size; + dict_table_stats_lock(ib_table, RW_S_LATCH); n_rows = ib_table->stat_n_rows; @@ -8174,14 +8199,19 @@ ha_innobase::info_low( prebuilt->autoinc_last_value = 0; } + page_size = dict_table_zip_size(ib_table); + if (page_size == 0) { + page_size = UNIV_PAGE_SIZE; + } + stats.records = (ha_rows)n_rows; stats.deleted = 0; - stats.data_file_length = ((ulonglong) - ib_table->stat_clustered_index_size) - * UNIV_PAGE_SIZE; - stats.index_file_length = ((ulonglong) - ib_table->stat_sum_of_other_index_sizes) - * UNIV_PAGE_SIZE; + stats.data_file_length + = ((ulonglong) ib_table->stat_clustered_index_size) + * page_size; + stats.index_file_length = + ((ulonglong) ib_table->stat_sum_of_other_index_sizes) + * page_size; dict_table_stats_unlock(ib_table, RW_S_LATCH); @@ -9136,10 +9166,18 @@ ha_innobase::external_lock( reset_template(prebuilt); - if (lock_type == F_WRLCK) { + if (lock_type == F_WRLCK + || (table->s->tmp_table + && thd_sql_command(thd) == SQLCOM_LOCK_TABLES)) { /* If this is a SELECT, then it is in UPDATE TABLE ... - or SELECT ... FOR UPDATE */ + or SELECT ... FOR UPDATE + + For temporary tables which are locked for READ by LOCK TABLES + updates are still allowed by SQL-layer. In order to accomodate + for such a situation we always request X-lock for such table + at LOCK TABLES time. + */ prebuilt->select_lock_type = LOCK_X; prebuilt->stored_select_lock_type = LOCK_X; } @@ -10386,6 +10424,10 @@ innobase_xa_prepare( return(0); } + if (trx->fake_changes) { + return(0); + } + thd_get_xid(thd, (MYSQL_XID*) &trx->xid); /* Release a possible FIFO ticket and search latch. 
Since we will @@ -10503,7 +10545,7 @@ innobase_commit_by_xid( if (trx) { innobase_commit_low(trx); - + trx_free_for_background(trx); return(XA_OK); } else { return(XAER_NOTA); @@ -10529,7 +10571,9 @@ innobase_rollback_by_xid( trx = trx_get_trx_by_xid(xid); if (trx) { - return(innobase_rollback_trx(trx)); + int ret = innobase_rollback_trx(trx); + trx_free_for_background(trx); + return(ret); } else { return(XAER_NOTA); } @@ -11221,19 +11265,19 @@ static int show_innodb_vars(THD *thd, SHOW_VAR *var, char *buff) return 0; } -/*********************************************************************** +/*********************************************************************//** This function checks each index name for a table against reserved -system default primary index name 'GEN_CLUST_INDEX'. If a name matches, -this function pushes an warning message to the client, and returns true. */ +system default primary index name 'GEN_CLUST_INDEX'. If a name +matches, this function pushes an warning message to the client, +and returns true. +@return true if the index name matches the reserved name */ extern "C" UNIV_INTERN bool innobase_index_name_is_reserved( /*============================*/ - /* out: true if an index name - matches the reserved name */ - const trx_t* trx, /* in: InnoDB transaction handle */ - const KEY* key_info, /* in: Indexes to be created */ - ulint num_of_keys) /* in: Number of indexes to + THD* thd, /*!< in/out: MySQL connection */ + const KEY* key_info, /*!< in: Indexes to be created */ + ulint num_of_keys) /*!< in: Number of indexes to be created. 
*/ { const KEY* key; @@ -11245,7 +11289,7 @@ innobase_index_name_is_reserved( if (innobase_strcasecmp(key->name, innobase_index_reserve_name) == 0) { /* Push warning to mysql */ - push_warning_printf((THD*) trx->mysql_thd, + push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN, ER_WRONG_NAME_FOR_INDEX, "Cannot Create Index with name " @@ -11264,6 +11308,48 @@ innobase_index_name_is_reserved( return(false); } +/*********************************************************************** +functions for kill session of idle transaction */ +extern "C" +ibool +innobase_thd_is_idle( +/*=================*/ + const void* thd) /*!< in: thread handle (THD*) */ +{ +#ifdef EXTENDED_FOR_KILLIDLE + return(thd_command((const THD*) thd) == COM_SLEEP); +#else + return(FALSE); +#endif +} + +extern "C" +ib_int64_t +innobase_thd_get_start_time( +/*========================*/ + const void* thd) /*!< in: thread handle (THD*) */ +{ +#ifdef EXTENDED_FOR_KILLIDLE + return((ib_int64_t)thd_start_time((const THD*) thd)); +#else + return(0); /*dummy value*/ +#endif +} + +extern "C" +void +innobase_thd_kill( +/*==============*/ + void* thd) +{ +#ifdef EXTENDED_FOR_KILLIDLE + thd_kill((THD*) thd); +#else + return; +#endif +} + + static SHOW_VAR innodb_status_variables_export[]= { {"Innodb", (char*) &show_innodb_vars, SHOW_FUNC}, {NullS, NullS, SHOW_LONG} @@ -11323,7 +11409,7 @@ static MYSQL_SYSVAR_BOOL(recovery_stats, innobase_recovery_stats, static MYSQL_SYSVAR_ULINT(use_purge_thread, srv_use_purge_thread, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Number of purge devoted threads. 
#### over 1 is EXPERIMENTAL ####", - NULL, NULL, 1, 0, 64, 0); + NULL, NULL, 1, 0, UNIV_MAX_PARALLELISM, 0); static MYSQL_SYSVAR_BOOL(overwrite_relay_log_info, innobase_overwrite_relay_log_info, PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, @@ -11497,14 +11583,14 @@ static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size, "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.", NULL, NULL, 128*1024*1024L, 32*1024*1024L, LONGLONG_MAX, 1024*1024L); -static MYSQL_SYSVAR_UINT(buffer_pool_shm_key, srv_buffer_pool_shm_key, +static MYSQL_SYSVAR_UINT(buffer_pool_shm_key, innobase_buffer_pool_shm_key, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "[experimental] The key value of shared memory segment for the buffer pool. 0 (default) disables the feature.", + "[Deprecated option] no effect", NULL, NULL, 0, 0, INT_MAX32, 0); static MYSQL_SYSVAR_BOOL(buffer_pool_shm_checksum, innobase_buffer_pool_shm_checksum, PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, - "Enable buffer_pool_shm checksum validation (enabled by default).", + "[Deprecated option] no effect", NULL, NULL, TRUE); static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency, @@ -11517,6 +11603,15 @@ static MYSQL_SYSVAR_ULONG(concurrency_tickets, srv_n_free_tickets_to_enter, "Number of times a thread is allowed to enter InnoDB within the same SQL query after it has once got the ticket", NULL, NULL, 500L, 1L, ~0L, 0); +static MYSQL_SYSVAR_LONG(kill_idle_transaction, srv_kill_idle_transaction, + PLUGIN_VAR_RQCMDARG, +#ifdef EXTENDED_FOR_KILLIDLE + "If non-zero value, the idle session with transaction which is idle over the value in seconds is killed by InnoDB.", +#else + "No effect for this build.", +#endif + NULL, NULL, 0, 0, LONG_MAX, 0); + static MYSQL_SYSVAR_LONG(file_io_threads, innobase_file_io_threads, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR, "Number of file I/O threads in InnoDB.", @@ -11652,6 +11747,11 @@ static 
MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug, NULL, NULL, 0, 0, 1, 0); #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ +static MYSQL_SYSVAR_BOOL(random_read_ahead, srv_random_read_ahead, + PLUGIN_VAR_NOCMDARG, + "Whether to use read ahead for random access within an extent.", + NULL, NULL, FALSE); + static MYSQL_SYSVAR_ULONG(read_ahead_threshold, srv_read_ahead_threshold, PLUGIN_VAR_RQCMDARG, "Number of pages that must be accessed sequentially for InnoDB to " @@ -11775,13 +11875,19 @@ static MYSQL_SYSVAR_UINT(auto_lru_dump, srv_auto_lru_dump, "0 (the default) disables automatic dumps.", NULL, NULL, 0, 0, UINT_MAX32, 0); +static MYSQL_SYSVAR_BOOL(blocking_lru_restore, innobase_blocking_lru_restore, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Block XtraDB startup process until buffer pool is full restored from a " + "dump file (if present). Disabled by default.", + NULL, NULL, FALSE); + static MYSQL_SYSVAR_ULINT(pass_corrupt_table, srv_pass_corrupt_table, PLUGIN_VAR_RQCMDARG, "Pass corruptions of user tables as 'corrupt table' instead of not crashing itself, " "when used with file_per_table. 
" "All file io for the datafile after detected as corrupt are disabled, " "except for the deletion.", - NULL, NULL, 0, 0, 1, 0); + NULL, NULL, 0, 0, 2, 0); static MYSQL_SYSVAR_ULINT(lazy_drop_table, srv_lazy_drop_table, PLUGIN_VAR_RQCMDARG, @@ -11801,6 +11907,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(fast_checksum), MYSQL_SYSVAR(commit_concurrency), MYSQL_SYSVAR(concurrency_tickets), + MYSQL_SYSVAR(kill_idle_transaction), MYSQL_SYSVAR(data_file_path), MYSQL_SYSVAR(doublewrite_file), MYSQL_SYSVAR(data_home_dir), @@ -11875,12 +11982,15 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG MYSQL_SYSVAR(change_buffering_debug), #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + MYSQL_SYSVAR(random_read_ahead), MYSQL_SYSVAR(read_ahead_threshold), MYSQL_SYSVAR(io_capacity), MYSQL_SYSVAR(auto_lru_dump), + MYSQL_SYSVAR(blocking_lru_restore), MYSQL_SYSVAR(use_purge_thread), MYSQL_SYSVAR(pass_corrupt_table), MYSQL_SYSVAR(lazy_drop_table), + MYSQL_SYSVAR(fake_changes), NULL }; diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h index c60a5eae19e..bfe7432b32d 100644 --- a/storage/xtradb/handler/ha_innodb.h +++ b/storage/xtradb/handler/ha_innodb.h @@ -319,15 +319,14 @@ innobase_trx_allocate( This function checks each index name for a table against reserved system default primary index name 'GEN_CLUST_INDEX'. If a name matches, this function pushes an warning message to the client, -and returns true. */ +and returns true. 
+@return true if the index name matches the reserved name */ extern "C" bool innobase_index_name_is_reserved( /*============================*/ - /* out: true if the index name - matches the reserved name */ - const trx_t* trx, /* in: InnoDB transaction handle */ - const KEY* key_info, /* in: Indexes to be created */ - ulint num_of_keys); /* in: Number of indexes to + THD* thd, /*!< in/out: MySQL connection */ + const KEY* key_info, /*!< in: Indexes to be created */ + ulint num_of_keys); /*!< in: Number of indexes to be created. */ diff --git a/storage/xtradb/handler/handler0alter.cc b/storage/xtradb/handler/handler0alter.cc index c54b4a17fd2..37fddf71cbc 100644 --- a/storage/xtradb/handler/handler0alter.cc +++ b/storage/xtradb/handler/handler0alter.cc @@ -649,44 +649,47 @@ ha_innobase::add_index( update_thd(); - heap = mem_heap_create(1024); - /* In case MySQL calls this in the middle of a SELECT query, release possible adaptive hash latch to avoid deadlocks of threads. */ trx_search_latch_release_if_reserved(prebuilt->trx); - trx_start_if_not_started(prebuilt->trx); + if (prebuilt->trx->fake_changes) { + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + } - /* Create a background transaction for the operations on - the data dictionary tables. */ - trx = innobase_trx_allocate(user_thd); - trx_start_if_not_started(trx); + /* Check if the index name is reserved. */ + if (innobase_index_name_is_reserved(user_thd, key_info, num_of_keys)) { + DBUG_RETURN(-1); + } innodb_table = indexed_table = dict_table_get(prebuilt->table->name, FALSE); if (UNIV_UNLIKELY(!innodb_table)) { - error = HA_ERR_NO_SUCH_TABLE; - goto err_exit; + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); } - /* Check if the index name is reserved. 
*/ - if (innobase_index_name_is_reserved(trx, key_info, num_of_keys)) { - error = ER_WRONG_NAME_FOR_INDEX; - } else { - /* Check that index keys are sensible */ - error = innobase_check_index_keys(key_info, num_of_keys, - innodb_table); - } + /* Check that index keys are sensible */ + error = innobase_check_index_keys(key_info, num_of_keys, innodb_table); if (UNIV_UNLIKELY(error)) { -err_exit: + DBUG_RETURN(error); + } + + heap = mem_heap_create(1024); + trx_start_if_not_started(prebuilt->trx); + + /* Create a background transaction for the operations on + the data dictionary tables. */ + trx = innobase_trx_allocate(user_thd); + if (trx->fake_changes) { mem_heap_free(heap); trx_general_rollback_for_mysql(trx, NULL); trx_free_for_mysql(trx); - trx_commit_for_mysql(prebuilt->trx); - DBUG_RETURN(error); + DBUG_RETURN(HA_ERR_WRONG_COMMAND); } + trx_start_if_not_started(trx); + /* Create table containing all indexes to be built in this alter table add index so that they are in the correct order in the table. */ @@ -758,8 +761,12 @@ err_exit: ut_d(dict_table_check_for_dup_indexes(innodb_table, FALSE)); + mem_heap_free(heap); + trx_general_rollback_for_mysql(trx, NULL); row_mysql_unlock_data_dictionary(trx); - goto err_exit; + trx_free_for_mysql(trx); + trx_commit_for_mysql(prebuilt->trx); + DBUG_RETURN(error); } trx->table_id = indexed_table->id; @@ -782,10 +789,6 @@ err_exit: ut_ad(error == DB_SUCCESS); - /* We will need to rebuild index translation table. Set - valid index entry count in the translation table to zero */ - share->idx_trans_tbl.index_count = 0; - /* Commit the data dictionary transaction in order to release the table locks on the system tables. This means that if MySQL crashes while creating a new primary key inside @@ -911,6 +914,14 @@ error: } convert_error: + if (error == DB_SUCCESS) { + /* Build index is successful. We will need to + rebuild index translation table. 
Reset the + index entry count in the translation table + to zero, so that translation table will be rebuilt */ + share->idx_trans_tbl.index_count = 0; + } + error = convert_error_code_to_mysql(error, innodb_table->flags, user_thd); @@ -963,6 +974,10 @@ ha_innobase::prepare_drop_index( trx_search_latch_release_if_reserved(prebuilt->trx); trx = prebuilt->trx; + if (trx->fake_changes) { + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + } + /* Test and mark all the indexes to be dropped */ row_mysql_lock_data_dictionary(trx); @@ -1167,6 +1182,12 @@ ha_innobase::final_drop_index( /* Create a background transaction for the operations on the data dictionary tables. */ trx = innobase_trx_allocate(user_thd); + if (trx->fake_changes) { + trx_general_rollback_for_mysql(trx, NULL); + trx_free_for_mysql(trx); + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + } + trx_start_if_not_started(trx); /* Flag this transaction as a dictionary operation, so that diff --git a/storage/xtradb/handler/i_s.cc b/storage/xtradb/handler/i_s.cc index 11b4a4b4d60..fc7cd1c9b8f 100644 --- a/storage/xtradb/handler/i_s.cc +++ b/storage/xtradb/handler/i_s.cc @@ -953,7 +953,7 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_pool_pages = /* plugin author (for SHOW PLUGINS) */ /* const char* */ - STRUCT_FLD(author, plugin_author), + STRUCT_FLD(author, "Percona"), /* general descriptive text (for SHOW PLUGINS) */ /* const char* */ @@ -1002,7 +1002,7 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_pool_pages_index = /* plugin author (for SHOW PLUGINS) */ /* const char* */ - STRUCT_FLD(author, plugin_author), + STRUCT_FLD(author, "Percona"), /* general descriptive text (for SHOW PLUGINS) */ /* const char* */ @@ -1051,7 +1051,7 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_pool_pages_blob = /* plugin author (for SHOW PLUGINS) */ /* const char* */ - STRUCT_FLD(author, plugin_author), + STRUCT_FLD(author, "Percona"), /* general descriptive text (for SHOW PLUGINS) */ /* const char* */ @@ -2573,7 +2573,7 @@ 
UNIV_INTERN struct st_mysql_plugin i_s_innodb_rseg = /* plugin author (for SHOW PLUGINS) */ /* const char* */ - STRUCT_FLD(author, plugin_author), + STRUCT_FLD(author, "Percona"), /* general descriptive text (for SHOW PLUGINS) */ /* const char* */ @@ -2608,6 +2608,172 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_rseg = /*********************************************************************** */ +static ST_FIELD_INFO i_s_innodb_admin_command_info[] = +{ + {STRUCT_FLD(field_name, "result_message"), + STRUCT_FLD(field_length, 1024), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +#ifndef INNODB_COMPATIBILITY_HOOKS +#error InnoDB needs MySQL to be built with #define INNODB_COMPATIBILITY_HOOKS +#endif + +extern "C" { +char **thd_query(MYSQL_THD thd); +} + +static +int +i_s_innodb_admin_command_fill( +/*==========================*/ + THD* thd, + TABLE_LIST* tables, + COND* cond) +{ + TABLE* i_s_table = (TABLE *) tables->table; + char** query_str; + char* ptr; + char quote = '\0'; + const char* command_head = "XTRA_"; + + DBUG_ENTER("i_s_innodb_admin_command_fill"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + if(thd_sql_command(thd) != SQLCOM_SELECT) { + field_store_string(i_s_table->field[0], + "SELECT command is only accepted."); + goto end_func; + } + + query_str = thd_query(thd); + ptr = *query_str; + + for (; *ptr; ptr++) { + if (*ptr == quote) { + quote = '\0'; + } else if (quote) { + } else if (*ptr == '`' || *ptr == '"') { + quote = *ptr; + } else { + long i; + for (i = 0; command_head[i]; i++) { + if (toupper((int)(unsigned char)(ptr[i])) + != toupper((int)(unsigned char) + (command_head[i]))) { + goto nomatch; + } + } + break; +nomatch: + ; + } + } + + if (!*ptr) { + field_store_string(i_s_table->field[0], + "No XTRA_* command in the SQL 
statement." + " Please add /*!XTRA_xxxx*/ to the SQL."); + goto end_func; + } + + if (!strncasecmp("XTRA_HELLO", ptr, 10)) { + /* This is example command XTRA_HELLO */ + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: administration command test for XtraDB" + " 'XTRA_HELLO' was detected.\n"); + + field_store_string(i_s_table->field[0], + "Hello!"); + goto end_func; + } + else if (!strncasecmp("XTRA_LRU_DUMP", ptr, 13)) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: administration command 'XTRA_LRU_DUMP'" + " was detected.\n"); + + if (buf_LRU_file_dump()) { + field_store_string(i_s_table->field[0], + "XTRA_LRU_DUMP was succeeded."); + } else { + field_store_string(i_s_table->field[0], + "XTRA_LRU_DUMP was failed."); + } + + goto end_func; + } + else if (!strncasecmp("XTRA_LRU_RESTORE", ptr, 16)) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: administration command 'XTRA_LRU_RESTORE'" + " was detected.\n"); + + if (buf_LRU_file_restore()) { + field_store_string(i_s_table->field[0], + "XTRA_LRU_RESTORE was succeeded."); + } else { + field_store_string(i_s_table->field[0], + "XTRA_LRU_RESTORE was failed."); + } + + goto end_func; + } + + field_store_string(i_s_table->field[0], + "Undefined XTRA_* command."); + goto end_func; + +end_func: + if (schema_table_store_record(thd, i_s_table)) { + DBUG_RETURN(1); + } else { + DBUG_RETURN(0); + } +} + +static +int +i_s_innodb_admin_command_init( +/*==========================*/ + void* p) +{ + DBUG_ENTER("i_s_innodb_admin_command_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_innodb_admin_command_info; + schema->fill_table = i_s_innodb_admin_command_fill; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_admin_command = +{ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + STRUCT_FLD(info, &i_s_info), + STRUCT_FLD(name, "XTRADB_ADMIN_COMMAND"), + STRUCT_FLD(author, "Percona"), + STRUCT_FLD(descr, "XtraDB specific command 
acceptor"), + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + STRUCT_FLD(init, i_s_innodb_admin_command_init), + STRUCT_FLD(deinit, i_s_common_deinit), + STRUCT_FLD(version, 0x0100 /* 1.0 */), + STRUCT_FLD(status_vars, NULL), + STRUCT_FLD(system_vars, NULL), + STRUCT_FLD(__reserved1, NULL) +}; + +/*********************************************************************** +*/ static ST_FIELD_INFO i_s_innodb_table_stats_info[] = { {STRUCT_FLD(field_name, "table_schema"), @@ -2923,7 +3089,7 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_table_stats = STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), STRUCT_FLD(info, &i_s_info), STRUCT_FLD(name, "INNODB_TABLE_STATS"), - STRUCT_FLD(author, plugin_author), + STRUCT_FLD(author, "Percona"), STRUCT_FLD(descr, "InnoDB table statistics in memory"), STRUCT_FLD(license, PLUGIN_LICENSE_GPL), STRUCT_FLD(init, i_s_innodb_table_stats_init), @@ -2939,7 +3105,7 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_index_stats = STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), STRUCT_FLD(info, &i_s_info), STRUCT_FLD(name, "INNODB_INDEX_STATS"), - STRUCT_FLD(author, plugin_author), + STRUCT_FLD(author, "Percona"), STRUCT_FLD(descr, "InnoDB index statistics in memory"), STRUCT_FLD(license, PLUGIN_LICENSE_GPL), STRUCT_FLD(init, i_s_innodb_index_stats_init), @@ -2950,172 +3116,6 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_index_stats = STRUCT_FLD(__reserved1, NULL) }; -/*********************************************************************** -*/ -static ST_FIELD_INFO i_s_innodb_admin_command_info[] = -{ - {STRUCT_FLD(field_name, "result_message"), - STRUCT_FLD(field_length, 1024), - STRUCT_FLD(field_type, MYSQL_TYPE_STRING), - STRUCT_FLD(value, 0), - STRUCT_FLD(field_flags, 0), - STRUCT_FLD(old_name, ""), - STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, - - END_OF_ST_FIELD_INFO -}; - -#ifndef INNODB_COMPATIBILITY_HOOKS -#error InnoDB needs MySQL to be built with #define INNODB_COMPATIBILITY_HOOKS -#endif - -extern "C" { -char 
**thd_query(MYSQL_THD thd); -} - -static -int -i_s_innodb_admin_command_fill( -/*==========================*/ - THD* thd, - TABLE_LIST* tables, - COND* cond) -{ - TABLE* i_s_table = (TABLE *) tables->table; - char** query_str; - char* ptr; - char quote = '\0'; - const char* command_head = "XTRA_"; - - DBUG_ENTER("i_s_innodb_admin_command_fill"); - - /* deny access to non-superusers */ - if (check_global_access(thd, PROCESS_ACL)) { - DBUG_RETURN(0); - } - - if(thd_sql_command(thd) != SQLCOM_SELECT) { - field_store_string(i_s_table->field[0], - "SELECT command is only accepted."); - goto end_func; - } - - query_str = thd_query(thd); - ptr = *query_str; - - for (; *ptr; ptr++) { - if (*ptr == quote) { - quote = '\0'; - } else if (quote) { - } else if (*ptr == '`' || *ptr == '"') { - quote = *ptr; - } else { - long i; - for (i = 0; command_head[i]; i++) { - if (toupper((int)(unsigned char)(ptr[i])) - != toupper((int)(unsigned char) - (command_head[i]))) { - goto nomatch; - } - } - break; -nomatch: - ; - } - } - - if (!*ptr) { - field_store_string(i_s_table->field[0], - "No XTRA_* command in the SQL statement." 
- " Please add /*!XTRA_xxxx*/ to the SQL."); - goto end_func; - } - - if (!strncasecmp("XTRA_HELLO", ptr, 10)) { - /* This is example command XTRA_HELLO */ - - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: administration command test for XtraDB" - " 'XTRA_HELLO' was detected.\n"); - - field_store_string(i_s_table->field[0], - "Hello!"); - goto end_func; - } - else if (!strncasecmp("XTRA_LRU_DUMP", ptr, 13)) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: administration command 'XTRA_LRU_DUMP'" - " was detected.\n"); - - if (buf_LRU_file_dump()) { - field_store_string(i_s_table->field[0], - "XTRA_LRU_DUMP was succeeded."); - } else { - field_store_string(i_s_table->field[0], - "XTRA_LRU_DUMP was failed."); - } - - goto end_func; - } - else if (!strncasecmp("XTRA_LRU_RESTORE", ptr, 16)) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: administration command 'XTRA_LRU_RESTORE'" - " was detected.\n"); - - if (buf_LRU_file_restore()) { - field_store_string(i_s_table->field[0], - "XTRA_LRU_RESTORE was succeeded."); - } else { - field_store_string(i_s_table->field[0], - "XTRA_LRU_RESTORE was failed."); - } - - goto end_func; - } - - field_store_string(i_s_table->field[0], - "Undefined XTRA_* command."); - goto end_func; - -end_func: - if (schema_table_store_record(thd, i_s_table)) { - DBUG_RETURN(1); - } else { - DBUG_RETURN(0); - } -} - -static -int -i_s_innodb_admin_command_init( -/*==========================*/ - void* p) -{ - DBUG_ENTER("i_s_innodb_admin_command_init"); - ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; - - schema->fields_info = i_s_innodb_admin_command_info; - schema->fill_table = i_s_innodb_admin_command_fill; - - DBUG_RETURN(0); -} - -UNIV_INTERN struct st_mysql_plugin i_s_innodb_admin_command = -{ - STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), - STRUCT_FLD(info, &i_s_info), - STRUCT_FLD(name, "XTRADB_ADMIN_COMMAND"), - STRUCT_FLD(author, plugin_author), - STRUCT_FLD(descr, "XtraDB specific command acceptor"), - 
STRUCT_FLD(license, PLUGIN_LICENSE_GPL), - STRUCT_FLD(init, i_s_innodb_admin_command_init), - STRUCT_FLD(deinit, i_s_common_deinit), - STRUCT_FLD(version, 0x0100 /* 1.0 */), - STRUCT_FLD(status_vars, NULL), - STRUCT_FLD(system_vars, NULL), - STRUCT_FLD(__reserved1, NULL) -}; - static ST_FIELD_INFO i_s_innodb_sys_tables_info[] = { {STRUCT_FLD(field_name, "SCHEMA"), @@ -3651,15 +3651,14 @@ i_s_innodb_schema_table_fill( rec = btr_pcur_get_rec(&pcur); if (!btr_pcur_is_on_user_rec(&pcur)) { /* end of index */ - btr_pcur_close(&pcur); - mtr_commit(&mtr); break; } + + btr_pcur_store_position(&pcur, &mtr); + if (rec_get_deleted_flag(rec, 0)) { /* record marked as deleted */ - btr_pcur_close(&pcur); - mtr_commit(&mtr); - continue; + goto next_record; } if (id == 0) { @@ -3670,33 +3669,23 @@ i_s_innodb_schema_table_fill( status = copy_sys_stats_rec(table, index, rec); } if (status) { - btr_pcur_close(&pcur); - mtr_commit(&mtr); break; } -#if 0 - btr_pcur_store_position(&pcur, &mtr); - mtr_commit(&mtr); - status = schema_table_store_record(thd, table); if (status) { - btr_pcur_close(&pcur); break; } +next_record: + mtr_commit(&mtr); mtr_start(&mtr); btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr); -#else - status = schema_table_store_record(thd, table); - if (status) { - btr_pcur_close(&pcur); - mtr_commit(&mtr); - break; - } -#endif } + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mutex_exit(&(dict_sys->mutex)); DBUG_RETURN(status); @@ -3752,7 +3741,7 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_tables = STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), STRUCT_FLD(info, &i_s_info), STRUCT_FLD(name, "INNODB_SYS_TABLES"), - STRUCT_FLD(author, plugin_author), + STRUCT_FLD(author, "Percona"), STRUCT_FLD(descr, "InnoDB SYS_TABLES table"), STRUCT_FLD(license, PLUGIN_LICENSE_GPL), STRUCT_FLD(init, i_s_innodb_sys_tables_init), @@ -3768,7 +3757,7 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_indexes = STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), 
STRUCT_FLD(info, &i_s_info), STRUCT_FLD(name, "INNODB_SYS_INDEXES"), - STRUCT_FLD(author, plugin_author), + STRUCT_FLD(author, "Percona"), STRUCT_FLD(descr, "InnoDB SYS_INDEXES table"), STRUCT_FLD(license, PLUGIN_LICENSE_GPL), STRUCT_FLD(init, i_s_innodb_sys_indexes_init), @@ -3784,7 +3773,7 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_stats = STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), STRUCT_FLD(info, &i_s_info), STRUCT_FLD(name, "INNODB_SYS_STATS"), - STRUCT_FLD(author, plugin_author), + STRUCT_FLD(author, "Percona"), STRUCT_FLD(descr, "InnoDB SYS_STATS table"), STRUCT_FLD(license, PLUGIN_LICENSE_GPL), STRUCT_FLD(init, i_s_innodb_sys_stats_init), diff --git a/storage/xtradb/handler/innodb_patch_info.h b/storage/xtradb/handler/innodb_patch_info.h index e68f12d0fec..38b97411340 100644 --- a/storage/xtradb/handler/innodb_patch_info.h +++ b/storage/xtradb/handler/innodb_patch_info.h @@ -47,6 +47,5 @@ struct innodb_enhancement { {"innodb_fast_checksum","Using the checksum on 32bit-unit calculation","incompatible for unpatched ver.","http://www.percona.com/docs/wiki/percona-xtradb"}, {"innodb_files_extend","allow >4GB transaction log files, and can vary universal page size of datafiles","incompatible for unpatched ver.","http://www.percona.com/docs/wiki/percona-xtradb"}, {"innodb_sys_tables_sys_indexes","Expose InnoDB SYS_TABLES and SYS_INDEXES schema tables","","http://www.percona.com/docs/wiki/percona-xtradb"}, -{"innodb_buffer_pool_shm","Put buffer pool contents to shared memory segment and reuse it at clean restart [experimental]","","http://www.percona.com/docs/wiki/percona-xtradb"}, {NULL, NULL, NULL, NULL} }; diff --git a/storage/xtradb/ibuf/ibuf0ibuf.c b/storage/xtradb/ibuf/ibuf0ibuf.c index 3f741da60bb..64dc9a5591d 100644 --- a/storage/xtradb/ibuf/ibuf0ibuf.c +++ b/storage/xtradb/ibuf/ibuf0ibuf.c @@ -2613,6 +2613,8 @@ ibuf_insert_low( ut_a(trx_sys_multiple_tablespace_format); + ut_ad(!(thr_get_trx(thr)->fake_changes)); + do_merge = FALSE; 
mutex_enter(&ibuf_mutex); diff --git a/storage/xtradb/include/btr0btr.h b/storage/xtradb/include/btr0btr.h index dde3a0bab69..1fe40965c0f 100644 --- a/storage/xtradb/include/btr0btr.h +++ b/storage/xtradb/include/btr0btr.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -81,6 +81,91 @@ UNIQUE definition on secondary indexes when we decide if we can use the insert buffer to speed up inserts */ #define BTR_IGNORE_SEC_UNIQUE 2048 +#ifdef UNIV_BLOB_DEBUG +# include "ut0rbt.h" +/** An index->blobs entry for keeping track of off-page column references */ +struct btr_blob_dbg_struct +{ + unsigned blob_page_no:32; /*!< first BLOB page number */ + unsigned ref_page_no:32; /*!< referring page number */ + unsigned ref_heap_no:16; /*!< referring heap number */ + unsigned ref_field_no:10; /*!< referring field number */ + unsigned owner:1; /*!< TRUE if BLOB owner */ + unsigned always_owner:1; /*!< TRUE if always + has been the BLOB owner; + reset to TRUE on B-tree + page splits and merges */ + unsigned del:1; /*!< TRUE if currently + delete-marked */ +}; + +/**************************************************************//** +Add a reference to an off-page column to the index->blobs map. 
*/ +UNIV_INTERN +void +btr_blob_dbg_add_blob( +/*==================*/ + const rec_t* rec, /*!< in: clustered index record */ + ulint field_no, /*!< in: number of off-page column */ + ulint page_no, /*!< in: start page of the column */ + dict_index_t* index, /*!< in/out: index tree */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Display the references to off-page columns. +This function is to be called from a debugger, +for example when a breakpoint on ut_dbg_assertion_failed is hit. */ +UNIV_INTERN +void +btr_blob_dbg_print( +/*===============*/ + const dict_index_t* index) /*!< in: index tree */ + __attribute__((nonnull)); +/**************************************************************//** +Check that there are no references to off-page columns from or to +the given page. Invoked when freeing or clearing a page. +@return TRUE when no orphan references exist */ +UNIV_INTERN +ibool +btr_blob_dbg_is_empty( +/*==================*/ + dict_index_t* index, /*!< in: index */ + ulint page_no) /*!< in: page number */ + __attribute__((nonnull, warn_unused_result)); + +/**************************************************************//** +Modify the 'deleted' flag of a record. */ +UNIV_INTERN +void +btr_blob_dbg_set_deleted_flag( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ibool del) /*!< in: TRUE=deleted, FALSE=exists */ + __attribute__((nonnull)); +/**************************************************************//** +Change the ownership of an off-page column. 
*/ +UNIV_INTERN +void +btr_blob_dbg_owner( +/*===============*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ulint i, /*!< in: ith field in rec */ + ibool own) /*!< in: TRUE=owned, FALSE=disowned */ + __attribute__((nonnull)); +/** Assert that there are no BLOB references to or from the given page. */ +# define btr_blob_dbg_assert_empty(index, page_no) \ + ut_a(btr_blob_dbg_is_empty(index, page_no)) +#else /* UNIV_BLOB_DEBUG */ +# define btr_blob_dbg_add_blob(rec, field_no, page, index, ctx) ((void) 0) +# define btr_blob_dbg_set_deleted_flag(rec, index, offsets, del)((void) 0) +# define btr_blob_dbg_owner(rec, index, offsets, i, val) ((void) 0) +# define btr_blob_dbg_assert_empty(index, page_no) ((void) 0) +#endif /* UNIV_BLOB_DEBUG */ + /**************************************************************//** Gets the root node of a tree and x-latches it. @return root page, x-latched */ @@ -123,6 +208,17 @@ btr_block_get_func( @return the uncompressed page frame */ # define btr_page_get(space,zip_size,page_no,mode,mtr) \ buf_block_get_frame(btr_block_get(space,zip_size,page_no,mode,mtr)) +/**************************************************************//** +Sets the index id field of a page. */ +UNIV_INLINE +void +btr_page_set_index_id( +/*==================*/ + page_t* page, /*!< in: page to be created */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + dulint id, /*!< in: index id */ + mtr_t* mtr); /*!< in: mtr */ #endif /* !UNIV_HOTBACKUP */ /**************************************************************//** Gets the index id field of a page. @@ -160,6 +256,17 @@ btr_page_get_next( const page_t* page, /*!< in: index page */ mtr_t* mtr); /*!< in: mini-transaction handle */ /********************************************************//** +Sets the next index page field. 
*/ +UNIV_INLINE +void +btr_page_set_next( +/*==============*/ + page_t* page, /*!< in: index page */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + ulint next, /*!< in: next page number */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************//** Gets the previous index page number. @return prev page number */ UNIV_INLINE @@ -168,6 +275,17 @@ btr_page_get_prev( /*==============*/ const page_t* page, /*!< in: index page */ mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************//** +Sets the previous index page field. */ +UNIV_INLINE +void +btr_page_set_prev( +/*==============*/ + page_t* page, /*!< in: index page */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + ulint prev, /*!< in: previous page number */ + mtr_t* mtr); /*!< in: mini-transaction handle */ /*************************************************************//** Gets pointer to the previous user record in the tree. It is assumed that the caller has appropriate latches on the page and its neighbor. @@ -213,6 +331,18 @@ btr_node_ptr_get_child_page_no( /*===========================*/ const rec_t* rec, /*!< in: node pointer record */ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ +/**************************************************************//** +Creates a new index page (not the root, and also not +used in page reorganization). @see btr_page_empty(). 
*/ +UNIV_INTERN +void +btr_page_create( +/*============*/ + buf_block_t* block, /*!< in/out: page to be created */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: the B-tree level of the page */ + mtr_t* mtr); /*!< in: mtr */ /************************************************************//** Creates the root node for a new index tree. @return page number of the created root, FIL_NULL if did not succeed */ @@ -283,6 +413,17 @@ btr_page_reorganize( dict_index_t* index, /*!< in: record descriptor */ mtr_t* mtr); /*!< in: mtr */ /*************************************************************//** +Empties an index page. @see btr_page_create(). */ +UNIV_INTERN +void +btr_page_empty( +/*===========*/ + buf_block_t* block, /*!< in: page to be emptied */ + page_zip_des_t* page_zip,/*!< out: compressed page, or NULL */ + dict_index_t* index, /*!< in: index of the page */ + ulint level, /*!< in: the B-tree level of the page */ + mtr_t* mtr); /*!< in: mtr */ +/*************************************************************//** Decides if the page should be split at the convergence point of inserts converging to left. @return TRUE if split recommended */ @@ -341,6 +482,20 @@ btr_insert_on_non_leaf_level_func( # define btr_insert_on_non_leaf_level(i,l,t,m) \ btr_insert_on_non_leaf_level_func(i,l,t,__FILE__,__LINE__,m) #endif /* !UNIV_HOTBACKUP */ +/**************************************************************//** +Attaches the halves of an index page on the appropriate level in an +index tree. 
*/ +UNIV_INTERN +void +btr_attach_half_pages( +/*==================*/ + dict_index_t* index, /*!< in: the index tree */ + buf_block_t* block, /*!< in/out: page to be split */ + const rec_t* split_rec, /*!< in: first record on upper + half page */ + buf_block_t* new_block, /*!< in/out: the new half page */ + ulint direction, /*!< in: FSP_UP or FSP_DOWN */ + mtr_t* mtr); /*!< in: mtr */ /****************************************************************//** Sets a record as the predefined minimum record. */ UNIV_INTERN @@ -385,11 +540,14 @@ UNIV_INTERN ibool btr_compress( /*=========*/ - btr_cur_t* cursor, /*!< in: cursor on the page to merge or lift; - the page must not be empty: in record delete - use btr_discard_page if the page would become - empty */ - mtr_t* mtr); /*!< in: mtr */ + btr_cur_t* cursor, /*!< in/out: cursor on the page to merge + or lift; the page must not be empty: + when deleting records, use btr_discard_page() + if the page would become empty */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); /*************************************************************//** Discards a page from a B-tree. This is used to remove the last record from a B-tree page: the whole page must be removed at the same time. This cannot diff --git a/storage/xtradb/include/btr0cur.h b/storage/xtradb/include/btr0cur.h index ece3621fa97..6f4ce95d72f 100644 --- a/storage/xtradb/include/btr0cur.h +++ b/storage/xtradb/include/btr0cur.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -36,6 +36,9 @@ Created 10/16/1994 Heikki Tuuri #define BTR_NO_LOCKING_FLAG 2 /* do no record lock checking */ #define BTR_KEEP_SYS_FLAG 4 /* sys fields will be found from the update vector or inserted entry */ +#define BTR_KEEP_POS_FLAG 8 /* btr_cur_pessimistic_update() + must keep cursor position when + moving columns to big_rec */ #ifndef UNIV_HOTBACKUP #include "que0types.h" @@ -309,7 +312,9 @@ btr_cur_pessimistic_update( /*=======================*/ ulint flags, /*!< in: undo logging, locking, and rollback flags */ - btr_cur_t* cursor, /*!< in: cursor on the record to update */ + btr_cur_t* cursor, /*!< in/out: cursor on the record to update; + cursor may become invalid if *big_rec == NULL + || !(flags & BTR_KEEP_POS_FLAG) */ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to be stored externally by the caller, or NULL */ @@ -321,6 +326,16 @@ btr_cur_pessimistic_update( que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr); /*!< in: mtr; must be committed before latching any further pages */ +/***************************************************************** +Commits and restarts a mini-transaction so that it will retain an +x-lock on index->lock and the cursor page. */ +UNIV_INTERN +void +btr_cur_mtr_commit_and_start( +/*=========================*/ + btr_cur_t* cursor, /*!< in: cursor */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); /***********************************************************//** Marks a clustered index record deleted. Writes an undo log record to undo log on this delete marking. 
Writes in the trx id field the id @@ -376,10 +391,13 @@ UNIV_INTERN ibool btr_cur_compress_if_useful( /*=======================*/ - btr_cur_t* cursor, /*!< in: cursor on the page to compress; + btr_cur_t* cursor, /*!< in/out: cursor on the page to compress; cursor does not stay valid if compression occurs */ - mtr_t* mtr); /*!< in: mtr */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); /*******************************************************//** Removes the record on which the tree cursor is positioned. It is assumed that the mtr has an x-latch on the page where the cursor is positioned, @@ -652,6 +670,11 @@ struct btr_path_struct{ order); value ULINT_UNDEFINED denotes array end */ ulint n_recs; /*!< number of records on the page */ + ulint page_no; /*!< no of the page containing the record */ + ulint page_level; /*!< level of the page, if later we fetch + the page under page_no and it is no different + level then we know that the tree has been + reorganized */ }; #define BTR_PATH_ARRAY_N_SLOTS 250 /*!< size of path array (in slots) */ diff --git a/storage/xtradb/include/btr0cur.ic b/storage/xtradb/include/btr0cur.ic index 280583f6ccf..c833b3e8572 100644 --- a/storage/xtradb/include/btr0cur.ic +++ b/storage/xtradb/include/btr0cur.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -139,7 +139,7 @@ btr_cur_compress_recommendation( btr_cur_t* cursor, /*!< in: btr cursor */ mtr_t* mtr) /*!< in: mtr */ { - page_t* page; + const page_t* page; ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), MTR_MEMO_PAGE_X_FIX)); diff --git a/storage/xtradb/include/btr0pcur.h b/storage/xtradb/include/btr0pcur.h index 2334a266280..f59514d04b3 100644 --- a/storage/xtradb/include/btr0pcur.h +++ b/storage/xtradb/include/btr0pcur.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -244,18 +244,6 @@ btr_pcur_restore_position_func( mtr_t* mtr); /*!< in: mtr */ #define btr_pcur_restore_position(l,cur,mtr) \ btr_pcur_restore_position_func(l,cur,__FILE__,__LINE__,mtr) -/**************************************************************//** -If the latch mode of the cursor is BTR_LEAF_SEARCH or BTR_LEAF_MODIFY, -releases the page latch and bufferfix reserved by the cursor. -NOTE! In the case of BTR_LEAF_MODIFY, there should not exist changes -made by the current mini-transaction to the data protected by the -cursor latch, as then the latch must not be released until mtr_commit. */ -UNIV_INTERN -void -btr_pcur_release_leaf( -/*==================*/ - btr_pcur_t* cursor, /*!< in: persistent cursor */ - mtr_t* mtr); /*!< in: mtr */ /*********************************************************//** Gets the rel_pos field for a cursor whose position has been stored. @return BTR_PCUR_ON, ... 
*/ @@ -282,10 +270,9 @@ btr_pcur_get_mtr( btr_pcur_t* cursor); /*!< in: persistent cursor */ /**************************************************************//** Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES, -that is, the cursor becomes detached. If there have been modifications -to the page where pcur is positioned, this can be used instead of -btr_pcur_release_leaf. Function btr_pcur_store_position should be used -before calling this, if restoration of cursor is wanted later. */ +that is, the cursor becomes detached. +Function btr_pcur_store_position should be used before calling this, +if restoration of cursor is wanted later. */ UNIV_INLINE void btr_pcur_commit_specify_mtr( diff --git a/storage/xtradb/include/btr0pcur.ic b/storage/xtradb/include/btr0pcur.ic index 0c38797e6c5..0f9b969e7c5 100644 --- a/storage/xtradb/include/btr0pcur.ic +++ b/storage/xtradb/include/btr0pcur.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -396,10 +396,9 @@ btr_pcur_move_to_next( /**************************************************************//** Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES, -that is, the cursor becomes detached. If there have been modifications -to the page where pcur is positioned, this can be used instead of -btr_pcur_release_leaf. Function btr_pcur_store_position should be used -before calling this, if restoration of cursor is wanted later. */ +that is, the cursor becomes detached. +Function btr_pcur_store_position should be used before calling this, +if restoration of cursor is wanted later. 
*/ UNIV_INLINE void btr_pcur_commit_specify_mtr( diff --git a/storage/xtradb/include/btr0types.h b/storage/xtradb/include/btr0types.h index ef4a6b04b34..07c06fb18d7 100644 --- a/storage/xtradb/include/btr0types.h +++ b/storage/xtradb/include/btr0types.h @@ -38,6 +38,131 @@ typedef struct btr_cur_struct btr_cur_t; /** B-tree search information for the adaptive hash index */ typedef struct btr_search_struct btr_search_t; +#ifdef UNIV_BLOB_DEBUG +# include "buf0types.h" +/** An index->blobs entry for keeping track of off-page column references */ +typedef struct btr_blob_dbg_struct btr_blob_dbg_t; + +/** Insert to index->blobs a reference to an off-page column. +@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_insert( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); + +/** Remove from index->blobs a reference to an off-page column. +@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_delete( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); + +/**************************************************************//** +Add to index->blobs any references to off-page columns from a record. 
+@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add_rec( +/*=================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Remove from index->blobs any references to off-page columns from a record. +@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove_rec( +/*====================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Count and add to index->blobs any references to off-page columns +from records on a page. +@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add( +/*=============*/ + const page_t* page, /*!< in: rewritten page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Count and remove from index->blobs any references to off-page columns +from records on a page. +Used when reorganizing a page, before copying the records. +@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove( +/*================*/ + const page_t* page, /*!< in: b-tree page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Restore in index->blobs any references to off-page columns +Used when page reorganize fails due to compressed page overflow. 
*/ +UNIV_INTERN +void +btr_blob_dbg_restore( +/*=================*/ + const page_t* npage, /*!< in: page that failed to compress */ + const page_t* page, /*!< in: copy of original page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); + +/** Operation that processes the BLOB references of an index record +@param[in] rec record on index page +@param[in/out] index the index tree of the record +@param[in] offsets rec_get_offsets(rec,index) +@param[in] ctx context (for logging) +@return number of BLOB references processed */ +typedef ulint (*btr_blob_dbg_op_f) +(const rec_t* rec,dict_index_t* index,const ulint* offsets,const char* ctx); + +/**************************************************************//** +Count and process all references to off-page columns on a page. +@return number of references processed */ +UNIV_INTERN +ulint +btr_blob_dbg_op( +/*============*/ + const page_t* page, /*!< in: B-tree leaf page */ + const rec_t* rec, /*!< in: record to start from + (NULL to process the whole page) */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx, /*!< in: context (for logging) */ + const btr_blob_dbg_op_f op) /*!< in: operation on records */ + __attribute__((nonnull(1,3,4,5))); +#else /* UNIV_BLOB_DEBUG */ +# define btr_blob_dbg_add_rec(rec, index, offsets, ctx) ((void) 0) +# define btr_blob_dbg_add(page, index, ctx) ((void) 0) +# define btr_blob_dbg_remove_rec(rec, index, offsets, ctx) ((void) 0) +# define btr_blob_dbg_remove(page, index, ctx) ((void) 0) +# define btr_blob_dbg_restore(npage, page, index, ctx) ((void) 0) +# define btr_blob_dbg_op(page, rec, index, ctx, op) ((void) 0) +#endif /* UNIV_BLOB_DEBUG */ + /** The size of a reference to data stored on a different page. The reference is stored at the end of the prefix of the field in the index record. 
*/ diff --git a/storage/xtradb/include/buf0buddy.h b/storage/xtradb/include/buf0buddy.h index 3a35f8e46e9..f4c3da8692d 100644 --- a/storage/xtradb/include/buf0buddy.h +++ b/storage/xtradb/include/buf0buddy.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -37,25 +37,20 @@ Created December 2006 by Marko Makela /**********************************************************************//** Allocate a block. The thread calling this function must hold buf_pool_mutex and must not hold buf_pool_zip_mutex or any -block->mutex. The buf_pool_mutex may only be released and reacquired -if lru != NULL. This function should only be used for allocating -compressed page frames or control blocks (buf_page_t). Allocated -control blocks must be properly initialized immediately after -buf_buddy_alloc() has returned the memory, before releasing -buf_pool_mutex. -@return allocated block, possibly NULL if lru == NULL */ +block->mutex. The buf_pool_mutex may be released and reacquired. +This function should only be used for allocating compressed page frames. 
+@return allocated block, never NULL */ UNIV_INLINE void* buf_buddy_alloc( /*============*/ - ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */ + ulint size, /*!< in: compressed page size + (between PAGE_ZIP_MIN_SIZE and UNIV_PAGE_SIZE) */ ibool* lru, /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list - and buf_pool_mutex was temporarily released, - or NULL if the LRU list should not be used */ + and buf_pool_mutex was temporarily released */ ibool have_page_hash_mutex) - __attribute__((malloc)); - + __attribute__((malloc, nonnull)); /**********************************************************************//** Release a block. */ UNIV_INLINE diff --git a/storage/xtradb/include/buf0buddy.ic b/storage/xtradb/include/buf0buddy.ic index 69659fb69d6..63241311e1f 100644 --- a/storage/xtradb/include/buf0buddy.ic +++ b/storage/xtradb/include/buf0buddy.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -36,8 +36,8 @@ Created December 2006 by Marko Makela /**********************************************************************//** Allocate a block. The thread calling this function must hold buf_pool_mutex and must not hold buf_pool_zip_mutex or any block->mutex. -The buf_pool_mutex may only be released and reacquired if lru != NULL. -@return allocated block, possibly NULL if lru==NULL */ +The buf_pool_mutex may be released and reacquired. 
+@return allocated block, never NULL */ UNIV_INTERN void* buf_buddy_alloc_low( @@ -46,10 +46,9 @@ buf_buddy_alloc_low( or BUF_BUDDY_SIZES */ ibool* lru, /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list - and buf_pool_mutex was temporarily released, - or NULL if the LRU list should not be used */ + and buf_pool_mutex was temporarily released */ ibool have_page_hash_mutex) - __attribute__((malloc)); + __attribute__((malloc, nonnull)); /**********************************************************************//** Deallocate a block. */ @@ -76,6 +75,8 @@ buf_buddy_get_slot( ulint i; ulint s; + ut_ad(size >= PAGE_ZIP_MIN_SIZE); + for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) { } @@ -86,27 +87,26 @@ buf_buddy_get_slot( /**********************************************************************//** Allocate a block. The thread calling this function must hold buf_pool_mutex and must not hold buf_pool_zip_mutex or any -block->mutex. The buf_pool_mutex may only be released and reacquired -if lru != NULL. This function should only be used for allocating -compressed page frames or control blocks (buf_page_t). Allocated -control blocks must be properly initialized immediately after -buf_buddy_alloc() has returned the memory, before releasing -buf_pool_mutex. -@return allocated block, possibly NULL if lru == NULL */ +block->mutex. The buf_pool_mutex may be released and reacquired. +This function should only be used for allocating compressed page frames. 
+@return allocated block, never NULL */ UNIV_INLINE void* buf_buddy_alloc( /*============*/ - ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */ + ulint size, /*!< in: compressed page size + (between PAGE_ZIP_MIN_SIZE and UNIV_PAGE_SIZE) */ ibool* lru, /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list - and buf_pool_mutex was temporarily released, - or NULL if the LRU list should not be used */ + and buf_pool_mutex was temporarily released */ ibool have_page_hash_mutex) { //ut_ad(buf_pool_mutex_own()); + ut_ad(ut_is_2pow(size)); + ut_ad(size >= PAGE_ZIP_MIN_SIZE); + ut_ad(size <= UNIV_PAGE_SIZE); - return(buf_buddy_alloc_low(buf_buddy_get_slot(size), lru, have_page_hash_mutex)); + return((byte*) buf_buddy_alloc_low(buf_buddy_get_slot(size), lru, have_page_hash_mutex)); } /**********************************************************************//** @@ -121,6 +121,9 @@ buf_buddy_free( ibool have_page_hash_mutex) { //ut_ad(buf_pool_mutex_own()); + ut_ad(ut_is_2pow(size)); + ut_ad(size >= PAGE_ZIP_MIN_SIZE); + ut_ad(size <= UNIV_PAGE_SIZE); if (!have_page_hash_mutex) { mutex_enter(&LRU_list_mutex); diff --git a/storage/xtradb/include/buf0buf.h b/storage/xtradb/include/buf0buf.h index bc0e9170281..838dd7f3900 100644 --- a/storage/xtradb/include/buf0buf.h +++ b/storage/xtradb/include/buf0buf.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -36,12 +36,13 @@ Created 11/5/1995 Heikki Tuuri #include "ut0rbt.h" #ifndef UNIV_HOTBACKUP #include "os0proc.h" -#include "srv0srv.h" /** @name Modes for buf_page_get_gen */ /* @{ */ #define BUF_GET 10 /*!< get always */ #define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */ +#define BUF_PEEK_IF_IN_POOL 12 /*!< get if in pool, do not make + the block young in the LRU list */ #define BUF_GET_NO_LATCH 14 /*!< get and bufferfix, but set no latch; we have separated this case, because @@ -140,12 +141,6 @@ buf_relocate( BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */ buf_page_t* dpage) /*!< in/out: destination control block */ __attribute__((nonnull)); -/********************************************************************//** -Resizes the buffer pool. */ -UNIV_INTERN -void -buf_pool_resize(void); -/*=================*/ /*********************************************************************//** Gets the current size of buffer buf_pool in bytes. @return size in bytes */ @@ -162,6 +157,23 @@ ib_uint64_t buf_pool_get_oldest_modification(void); /*==================================*/ /********************************************************************//** +Allocates a buf_page_t descriptor. This function must succeed. In case +of failure we assert in this function. */ +UNIV_INLINE +buf_page_t* +buf_page_alloc_descriptor(void) +/*===========================*/ + __attribute__((malloc)); +/********************************************************************//** +Free a buf_page_t descriptor. */ +UNIV_INLINE +void +buf_page_free_descriptor( +/*=====================*/ + buf_page_t* bpage) /*!< in: bpage descriptor to free. */ + __attribute__((nonnull)); + +/********************************************************************//** Allocates a buffer block. 
@return own: the allocated block, in state BUF_BLOCK_MEMORY */ UNIV_INLINE @@ -285,7 +297,7 @@ buf_page_get_gen( ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ buf_block_t* guess, /*!< in: guessed block or NULL */ ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL, - BUF_GET_NO_LATCH */ + BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH */ const char* file, /*!< in: file name */ ulint line, /*!< in: line where called */ mtr_t* mtr); /*!< in: mini-transaction */ @@ -415,6 +427,18 @@ buf_block_get_freed_page_clock( __attribute__((pure)); /********************************************************************//** +Tells if a block is still close enough to the MRU end of the LRU list +meaning that it is not in danger of getting evicted and also implying +that it has been accessed recently. +Note that this is for heuristics only and does not reserve buffer pool +mutex. +@return TRUE if block is close to MRU end of LRU */ +UNIV_INLINE +ibool +buf_page_peek_if_young( +/*===================*/ + const buf_page_t* bpage); /*!< in: block */ +/********************************************************************//** Recommends a move of a block to the start of the LRU list if there is danger of dropping from the buffer pool. NOTE: does not reserve the buffer pool mutex. @@ -466,6 +490,31 @@ buf_block_get_modify_clock( #else /* !UNIV_HOTBACKUP */ # define buf_block_modify_clock_inc(block) ((void) 0) #endif /* !UNIV_HOTBACKUP */ +/*******************************************************************//** +Increments the bufferfix count. */ +UNIV_INLINE +void +buf_block_buf_fix_inc_func( +/*=======================*/ +#ifdef UNIV_SYNC_DEBUG + const char* file, /*!< in: file name */ + ulint line, /*!< in: line */ +#endif /* UNIV_SYNC_DEBUG */ + buf_block_t* block) /*!< in/out: block to bufferfix */ + __attribute__((nonnull)); +#ifdef UNIV_SYNC_DEBUG +/** Increments the bufferfix count. 
+@param b in/out: block to bufferfix +@param f in: file name where requested +@param l in: line number where requested */ +# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b) +#else /* UNIV_SYNC_DEBUG */ +/** Increments the bufferfix count. +@param b in/out: block to bufferfix +@param f in: file name where requested +@param l in: line number where requested */ +# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b) +#endif /* UNIV_SYNC_DEBUG */ /********************************************************************//** Calculates a page checksum which is stored to the page when it is written to a file. Note that we must be careful to calculate the same value @@ -986,8 +1035,7 @@ UNIV_INTERN void buf_page_io_complete( /*=================*/ - buf_page_t* bpage, /*!< in: pointer to the block in question */ - trx_t* trx); + buf_page_t* bpage); /*!< in: pointer to the block in question */ /********************************************************************//** Calculates a folded value of a file page address to use in the page hash table. @@ -1301,10 +1349,7 @@ struct buf_block_struct{ /**********************************************************************//** Compute the hash fold value for blocks in buf_pool->zip_hash. 
*/ /* @{ */ -/* the fold should be relative when srv_buffer_pool_shm_key is enabled */ -#define BUF_POOL_ZIP_FOLD_PTR(ptr) (!srv_buffer_pool_shm_key\ - ?((ulint) (ptr) / UNIV_PAGE_SIZE)\ - :((ulint) ((byte*)ptr - (byte*)(buf_pool->chunks->blocks->frame)) / UNIV_PAGE_SIZE)) +#define BUF_POOL_ZIP_FOLD_PTR(ptr) ((ulint) (ptr) / UNIV_PAGE_SIZE) #define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame) #define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b)) /* @} */ @@ -1330,6 +1375,8 @@ struct buf_pool_stat_struct{ ulint n_pages_written;/*!< number write operations */ ulint n_pages_created;/*!< number of pages created in the pool with no read */ + ulint n_ra_pages_read_rnd;/*!< number of pages read in + as part of random read ahead */ ulint n_ra_pages_read;/*!< number of pages read in as part of read ahead */ ulint n_ra_pages_evicted;/*!< number of read ahead @@ -1453,8 +1500,10 @@ struct buf_pool_struct{ frames and buf_page_t descriptors of blocks that exist in the buffer pool only in compressed form. */ /* @{ */ +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG UT_LIST_BASE_NODE_T(buf_page_t) zip_clean; /*!< unmodified compressed pages */ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ UT_LIST_BASE_NODE_T(buf_page_t) zip_free[BUF_BUDDY_SIZES_MAX]; /*!< buddy free lists */ //#if BUF_BUDDY_HIGH != UNIV_PAGE_SIZE @@ -1486,8 +1535,8 @@ Use these instead of accessing buf_pool_mutex directly. */ /** Test if buf_pool_mutex is owned. */ #define buf_pool_mutex_own() mutex_own(&buf_pool_mutex) /** Acquire the buffer pool mutex. 
*/ +/* the buf_pool_mutex is changed the latch order */ #define buf_pool_mutex_enter() do { \ - ut_ad(!mutex_own(&buf_pool_zip_mutex)); \ mutex_enter(&buf_pool_mutex); \ } while (0) diff --git a/storage/xtradb/include/buf0buf.ic b/storage/xtradb/include/buf0buf.ic index 2cb0d8ef497..a081d6a34c0 100644 --- a/storage/xtradb/include/buf0buf.ic +++ b/storage/xtradb/include/buf0buf.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -62,6 +62,27 @@ buf_block_get_freed_page_clock( } /********************************************************************//** +Tells if a block is still close enough to the MRU end of the LRU list +meaning that it is not in danger of getting evicted and also implying +that it has been accessed recently. +Note that this is for heuristics only and does not reserve buffer pool +mutex. +@return TRUE if block is close to MRU end of LRU */ +UNIV_INLINE +ibool +buf_page_peek_if_young( +/*===================*/ + const buf_page_t* bpage) /*!< in: block */ +{ + /* FIXME: bpage->freed_page_clock is 31 bits */ + return((buf_pool->freed_page_clock & ((1UL << 31) - 1)) + < ((ulint) bpage->freed_page_clock + + (buf_pool->curr_size + * (BUF_LRU_OLD_RATIO_DIV - buf_LRU_old_ratio) + / (BUF_LRU_OLD_RATIO_DIV * 4)))); +} + +/********************************************************************//** Recommends a move of a block to the start of the LRU list if there is danger of dropping from the buffer pool. NOTE: does not reserve the buffer pool mutex. 
@@ -89,12 +110,7 @@ buf_page_peek_if_too_old( buf_pool->stat.n_pages_not_made_young++; return(FALSE); } else { - /* FIXME: bpage->freed_page_clock is 31 bits */ - return((buf_pool->freed_page_clock & ((1UL << 31) - 1)) - > ((ulint) bpage->freed_page_clock - + (buf_pool->curr_size - * (BUF_LRU_OLD_RATIO_DIV - buf_LRU_old_ratio) - / (BUF_LRU_OLD_RATIO_DIV * 4)))); + return(!buf_page_peek_if_young(bpage)); } } @@ -754,6 +770,35 @@ buf_block_get_lock_hash_val( } /********************************************************************//** +Allocates a buf_page_t descriptor. This function must succeed. In case +of failure we assert in this function. +@return: the allocated descriptor. */ +UNIV_INLINE +buf_page_t* +buf_page_alloc_descriptor(void) +/*===========================*/ +{ + buf_page_t* bpage; + + bpage = (buf_page_t*) ut_malloc(sizeof *bpage); + ut_d(memset(bpage, 0, sizeof *bpage)); + UNIV_MEM_ALLOC(bpage, sizeof *bpage); + + return(bpage); +} + +/********************************************************************//** +Free a buf_page_t descriptor. */ +UNIV_INLINE +void +buf_page_free_descriptor( +/*=====================*/ + buf_page_t* bpage) /*!< in: bpage descriptor to free. */ +{ + ut_free(bpage); +} + +/********************************************************************//** Allocates a buffer block. @return own: the allocated block, in state BUF_BLOCK_MEMORY */ UNIV_INLINE @@ -910,19 +955,6 @@ buf_block_buf_fix_inc_func( block->page.buf_fix_count++; } -#ifdef UNIV_SYNC_DEBUG -/** Increments the bufferfix count. -@param b in/out: block to bufferfix -@param f in: file name where requested -@param l in: line number where requested */ -# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b) -#else /* UNIV_SYNC_DEBUG */ -/** Increments the bufferfix count. 
-@param b in/out: block to bufferfix -@param f in: file name where requested -@param l in: line number where requested */ -# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b) -#endif /* UNIV_SYNC_DEBUG */ /*******************************************************************//** Decrements the bufferfix count. */ @@ -1119,7 +1151,7 @@ buf_block_dbg_add_level( where we have acquired latch */ ulint level) /*!< in: latching order level */ { - sync_thread_add_level(&block->lock, level); + sync_thread_add_level(&block->lock, level, FALSE); } #endif /* UNIV_SYNC_DEBUG */ #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/include/buf0lru.h b/storage/xtradb/include/buf0lru.h index fe7c067dfb7..8abebfb675c 100644 --- a/storage/xtradb/include/buf0lru.h +++ b/storage/xtradb/include/buf0lru.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -30,18 +30,6 @@ Created 11/5/1995 Heikki Tuuri #include "ut0byte.h" #include "buf0types.h" -/** The return type of buf_LRU_free_block() */ -enum buf_lru_free_block_status { - /** freed */ - BUF_LRU_FREED = 0, - /** not freed because the caller asked to remove the - uncompressed frame but the control block cannot be - relocated */ - BUF_LRU_CANNOT_RELOCATE, - /** not freed because of some other reason */ - BUF_LRU_NOT_FREED -}; - /******************************************************************//** Tries to remove LRU flushed blocks from the end of the LRU list and put them to the free list. 
This is beneficial for the efficiency of the insert buffer @@ -91,6 +79,7 @@ void buf_LRU_mark_space_was_deleted( /*===========================*/ ulint id); /*!< in: space id */ +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /********************************************************************//** Insert a compressed block into buf_pool->zip_clean in the LRU order. */ UNIV_INTERN @@ -98,22 +87,22 @@ void buf_LRU_insert_zip_clean( /*=====================*/ buf_page_t* bpage); /*!< in: pointer to the block in question */ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ /******************************************************************//** Try to free a block. If bpage is a descriptor of a compressed-only page, the descriptor object will be freed as well. -NOTE: If this function returns BUF_LRU_FREED, it will temporarily +NOTE: If this function returns TRUE, it will temporarily release buf_pool_mutex. Furthermore, the page frame will no longer be accessible via bpage. The caller must hold buf_pool_mutex and buf_page_get_mutex(bpage) and release these two mutexes after the call. No other buf_page_get_mutex() may be held when calling this function. -@return BUF_LRU_FREED if freed, BUF_LRU_CANNOT_RELOCATE or -BUF_LRU_NOT_FREED otherwise. */ +@return TRUE if freed, FALSE otherwise. */ UNIV_INTERN -enum buf_lru_free_block_status +ibool buf_LRU_free_block( /*===============*/ buf_page_t* bpage, /*!< in: block to be freed */ diff --git a/storage/xtradb/include/buf0types.h b/storage/xtradb/include/buf0types.h index ce3e5ecc9c5..9dd847bdaca 100644 --- a/storage/xtradb/include/buf0types.h +++ b/storage/xtradb/include/buf0types.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. 
All Rights Reserved This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -26,6 +26,8 @@ Created 11/17/1995 Heikki Tuuri #ifndef buf0types_h #define buf0types_h +#include "page0types.h" + /** Buffer page (uncompressed or compressed) */ typedef struct buf_page_struct buf_page_t; /** Buffer block for which an uncompressed page exists */ @@ -58,17 +60,10 @@ enum buf_io_fix { /** Parameters of binary buddy system for compressed pages (buf0buddy.h) */ /* @{ */ -#if UNIV_WORD_SIZE <= 4 /* 32-bit system */ -/** Base-2 logarithm of the smallest buddy block size */ -# define BUF_BUDDY_LOW_SHIFT 6 -#else /* 64-bit system */ -/** Base-2 logarithm of the smallest buddy block size */ -# define BUF_BUDDY_LOW_SHIFT 7 -#endif +#define BUF_BUDDY_LOW_SHIFT PAGE_ZIP_MIN_SIZE_SHIFT + #define BUF_BUDDY_LOW (1 << BUF_BUDDY_LOW_SHIFT) - /*!< minimum block size in the binary - buddy system; must be at least - sizeof(buf_page_t) */ + #define BUF_BUDDY_SIZES (UNIV_PAGE_SIZE_SHIFT - BUF_BUDDY_LOW_SHIFT) #define BUF_BUDDY_SIZES_MAX (UNIV_PAGE_SIZE_SHIFT_MAX - BUF_BUDDY_LOW_SHIFT) /*!< number of buddy sizes */ diff --git a/storage/xtradb/include/dict0mem.h b/storage/xtradb/include/dict0mem.h index f47293bedf6..3554274847c 100644 --- a/storage/xtradb/include/dict0mem.h +++ b/storage/xtradb/include/dict0mem.h @@ -340,6 +340,13 @@ struct dict_index_struct{ index, or 0 if the index existed when InnoDB was started up */ #endif /* !UNIV_HOTBACKUP */ +#ifdef UNIV_BLOB_DEBUG + mutex_t blobs_mutex; + /*!< mutex protecting blobs */ + void* blobs; /*!< map of (page_no,heap_no,field_no) + to first_blob_page_no; protected by + blobs_mutex; @see btr_blob_dbg_t */ +#endif /* UNIV_BLOB_DEBUG */ #ifdef UNIV_DEBUG ulint magic_n;/*!< magic number */ /** Value of dict_index_struct::magic_n */ diff --git a/storage/xtradb/include/fil0fil.h b/storage/xtradb/include/fil0fil.h index fbf8ca20db3..11c4cb4ba03 100644 
--- a/storage/xtradb/include/fil0fil.h +++ b/storage/xtradb/include/fil0fil.h @@ -670,8 +670,9 @@ UNIV_INTERN void fil_flush( /*======*/ - ulint space_id); /*!< in: file space id (this can be a group of + ulint space_id, /*!< in: file space id (this can be a group of log files or a tablespace of the database) */ + ibool metadata); /**********************************************************************//** Flushes to disk writes in file spaces of the given type possibly cached by the OS. */ diff --git a/storage/xtradb/include/hash0hash.h b/storage/xtradb/include/hash0hash.h index 492c767acc4..b17c21a45ef 100644 --- a/storage/xtradb/include/hash0hash.h +++ b/storage/xtradb/include/hash0hash.h @@ -49,28 +49,6 @@ hash_table_t* hash_create( /*========*/ ulint n); /*!< in: number of array cells */ - -/*************************************************************//** -*/ -UNIV_INTERN -ulint -hash_create_needed( -/*===============*/ - ulint n); - -UNIV_INTERN -void -hash_create_init( -/*=============*/ - hash_table_t* table, - ulint n); - -UNIV_INTERN -void -hash_create_reuse( -/*==============*/ - hash_table_t* table); - #ifndef UNIV_HOTBACKUP /*************************************************************//** Creates a mutex array to protect a hash table. 
*/ @@ -350,33 +328,6 @@ do {\ }\ } while (0) -/********************************************************************//** -Align nodes with moving location.*/ -#define HASH_OFFSET(TABLE, NODE_TYPE, PTR_NAME, FADDR, FOFFSET, BOFFSET) \ -do {\ - ulint i2222;\ - ulint cell_count2222;\ -\ - cell_count2222 = hash_get_n_cells(TABLE);\ -\ - for (i2222 = 0; i2222 < cell_count2222; i2222++) {\ - NODE_TYPE* node2222;\ -\ - if ((TABLE)->array[i2222].node) \ - (TABLE)->array[i2222].node = (void*)((byte*)(TABLE)->array[i2222].node \ - + (((TABLE)->array[i2222].node > (void*)FADDR)?FOFFSET:BOFFSET));\ - node2222 = HASH_GET_FIRST((TABLE), i2222);\ -\ - while (node2222) {\ - if (node2222->PTR_NAME) \ - node2222->PTR_NAME = (void*)((byte*)(node2222->PTR_NAME) \ - + ((((void*)node2222->PTR_NAME) > (void*)FADDR)?FOFFSET:BOFFSET));\ -\ - node2222 = node2222->PTR_NAME;\ - }\ - }\ -} while (0) - /************************************************************//** Gets the mutex index for a fold value in a hash table. @return mutex number */ diff --git a/storage/xtradb/include/mtr0mtr.h b/storage/xtradb/include/mtr0mtr.h index bc3f1951be9..8a9ec8ea7f0 100644 --- a/storage/xtradb/include/mtr0mtr.h +++ b/storage/xtradb/include/mtr0mtr.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -213,16 +213,6 @@ ulint mtr_set_savepoint( /*==============*/ mtr_t* mtr); /*!< in: mtr */ -/**********************************************************//** -Releases the latches stored in an mtr memo down to a savepoint. -NOTE! The mtr must not have made changes to buffer pages after the -savepoint, as these can be handled only by mtr_commit. 
*/ -UNIV_INTERN -void -mtr_rollback_to_savepoint( -/*======================*/ - mtr_t* mtr, /*!< in: mtr */ - ulint savepoint); /*!< in: savepoint */ #ifndef UNIV_HOTBACKUP /**********************************************************//** Releases the (index tree) s-latch stored in an mtr memo after a diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h index 732e930517b..98cab5ef874 100644 --- a/storage/xtradb/include/os0file.h +++ b/storage/xtradb/include/os0file.h @@ -469,7 +469,8 @@ UNIV_INTERN ibool os_file_flush( /*==========*/ - os_file_t file); /*!< in, own: handle to a file */ + os_file_t file, /*!< in, own: handle to a file */ + ibool metadata); /***********************************************************************//** Retrieves the last error number if an error occurs in a file io function. The number should be retrieved before any other OS calls (because they may diff --git a/storage/xtradb/include/os0proc.h b/storage/xtradb/include/os0proc.h index 582cef6f803..fd46bd7db87 100644 --- a/storage/xtradb/include/os0proc.h +++ b/storage/xtradb/include/os0proc.h @@ -32,11 +32,6 @@ Created 9/30/1995 Heikki Tuuri #ifdef UNIV_LINUX #include <sys/ipc.h> #include <sys/shm.h> -#else -# if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H -#include <sys/ipc.h> -#include <sys/shm.h> -# endif #endif typedef void* os_process_t; @@ -75,29 +70,6 @@ os_mem_free_large( ulint size); /*!< in: size returned by os_mem_alloc_large() */ - -/****************************************************************//** -Allocates or attaches and reuses shared memory segment. -The content is not cleared automatically. -@return allocated memory */ -UNIV_INTERN -void* -os_shm_alloc( -/*=========*/ - ulint* n, /*!< in/out: number of bytes */ - uint key, - ibool* is_new); - -/****************************************************************//** -Detach shared memory segment. 
*/ -UNIV_INTERN -void -os_shm_free( -/*========*/ - void *ptr, /*!< in: pointer returned by - os_shm_alloc() */ - ulint size); /*!< in: size returned by - os_shm_alloc() */ #ifndef UNIV_NONINL #include "os0proc.ic" #endif diff --git a/storage/xtradb/include/page0cur.ic b/storage/xtradb/include/page0cur.ic index 3520677dfb3..81474fa35f5 100644 --- a/storage/xtradb/include/page0cur.ic +++ b/storage/xtradb/include/page0cur.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -27,6 +27,8 @@ Created 10/4/1994 Heikki Tuuri #include "buf0types.h" #ifdef UNIV_DEBUG +# include "rem0cmp.h" + /*********************************************************//** Gets pointer to the page frame where the cursor is positioned. @return page */ @@ -268,6 +270,7 @@ page_cur_tuple_insert( index, rec, offsets, mtr); } + ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, offsets)); mem_heap_free(heap); return(rec); } diff --git a/storage/xtradb/include/page0page.h b/storage/xtradb/include/page0page.h index 5b2bcf7c054..aeaef030505 100644 --- a/storage/xtradb/include/page0page.h +++ b/storage/xtradb/include/page0page.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -284,16 +284,42 @@ page_get_supremum_offset( const page_t* page); /*!< in: page which must have record(s) */ #define page_get_infimum_rec(page) ((page) + page_get_infimum_offset(page)) #define page_get_supremum_rec(page) ((page) + page_get_supremum_offset(page)) + /************************************************************//** -Returns the middle record of record list. If there are an even number -of records in the list, returns the first record of upper half-list. -@return middle record */ +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ UNIV_INTERN +const rec_t* +page_rec_get_nth_const( +/*===================*/ + const page_t* page, /*!< in: page */ + ulint nth) /*!< in: nth record */ + __attribute__((nonnull, warn_unused_result)); +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ +UNIV_INLINE +rec_t* +page_rec_get_nth( +/*=============*/ + page_t* page, /*< in: page */ + ulint nth) /*!< in: nth record */ + __attribute__((nonnull, warn_unused_result)); + +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Returns the middle record of the records on the page. If there is an +even number of records in the list, returns the first record of the +upper half-list. +@return middle record */ +UNIV_INLINE rec_t* page_get_middle_rec( /*================*/ - page_t* page); /*!< in: page */ -#ifndef UNIV_HOTBACKUP + page_t* page) /*!< in: page */ + __attribute__((nonnull, warn_unused_result)); /*************************************************************//** Compares a data tuple to a physical record. 
Differs from the function cmp_dtuple_rec_with_match in the way that the record must reside on an @@ -348,6 +374,7 @@ page_get_n_recs( /***************************************************************//** Returns the number of records before the given record in chain. The number includes infimum and supremum records. +This is the inverse function of page_rec_get_nth(). @return number of records */ UNIV_INTERN ulint diff --git a/storage/xtradb/include/page0page.ic b/storage/xtradb/include/page0page.ic index dab9dc742e4..b34408aed17 100644 --- a/storage/xtradb/include/page0page.ic +++ b/storage/xtradb/include/page0page.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -423,7 +423,37 @@ page_rec_is_infimum( return(page_rec_is_infimum_low(page_offset(rec))); } +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ +UNIV_INLINE +rec_t* +page_rec_get_nth( +/*=============*/ + page_t* page, /*!< in: page */ + ulint nth) /*!< in: nth record */ +{ + return((rec_t*) page_rec_get_nth_const(page, nth)); +} + #ifndef UNIV_HOTBACKUP +/************************************************************//** +Returns the middle record of the records on the page. If there is an +even number of records in the list, returns the first record of the +upper half-list. 
+@return middle record */ +UNIV_INLINE +rec_t* +page_get_middle_rec( +/*================*/ + page_t* page) /*!< in: page */ +{ + ulint middle = (page_get_n_recs(page) + PAGE_HEAP_NO_USER_LOW) / 2; + + return(page_rec_get_nth(page, middle)); +} + /*************************************************************//** Compares a data tuple to a physical record. Differs from the function cmp_dtuple_rec_with_match in the way that the record must reside on an diff --git a/storage/xtradb/include/page0zip.h b/storage/xtradb/include/page0zip.h index 4d37302ed20..fe3d2e52e0b 100644 --- a/storage/xtradb/include/page0zip.h +++ b/storage/xtradb/include/page0zip.h @@ -420,7 +420,7 @@ page_zip_copy_recs( const page_t* src, /*!< in: page */ dict_index_t* index, /*!< in: index of the B-tree */ mtr_t* mtr) /*!< in: mini-transaction */ - __attribute__((nonnull(1,2,3,4))); + __attribute__((nonnull)); #endif /* !UNIV_HOTBACKUP */ /**********************************************************************//** diff --git a/storage/xtradb/include/rem0rec.h b/storage/xtradb/include/rem0rec.h index 17d08afabb9..06de23be757 100644 --- a/storage/xtradb/include/rem0rec.h +++ b/storage/xtradb/include/rem0rec.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -480,6 +480,18 @@ ulint rec_offs_any_extern( /*================*/ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +/******************************************************//** +Determine if the offsets are for a record containing null BLOB pointers. 
+@return first field containing a null BLOB pointer, or NULL if none found */ +UNIV_INLINE +const byte* +rec_offs_any_null_extern( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + const ulint* offsets) /*!< in: rec_get_offsets(rec) */ + __attribute__((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ /******************************************************//** Returns nonzero if the extern bit is set in nth field of rec. @return nonzero if externally stored */ diff --git a/storage/xtradb/include/rem0rec.ic b/storage/xtradb/include/rem0rec.ic index fa96c97f95e..3875a4cf814 100644 --- a/storage/xtradb/include/rem0rec.ic +++ b/storage/xtradb/include/rem0rec.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -26,6 +26,7 @@ Created 5/30/1994 Heikki Tuuri #include "mach0data.h" #include "ut0byte.h" #include "dict0dict.h" +#include "btr0types.h" /* Compact flag ORed to the extra size returned by rec_get_offsets() */ #define REC_OFFS_COMPACT ((ulint) 1 << 31) @@ -1087,6 +1088,44 @@ rec_offs_any_extern( return(UNIV_UNLIKELY(*rec_offs_base(offsets) & REC_OFFS_EXTERNAL)); } +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +/******************************************************//** +Determine if the offsets are for a record containing null BLOB pointers. 
+@return first field containing a null BLOB pointer, or NULL if none found */ +UNIV_INLINE +const byte* +rec_offs_any_null_extern( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + const ulint* offsets) /*!< in: rec_get_offsets(rec) */ +{ + ulint i; + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (!rec_offs_any_extern(offsets)) { + return(NULL); + } + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* field + = rec_get_nth_field(rec, offsets, i, &len); + + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + if (!memcmp(field + len + - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)) { + return(field); + } + } + } + + return(NULL); +} +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + /******************************************************//** Returns nonzero if the extern bit is set in nth field of rec. @return nonzero if externally stored */ diff --git a/storage/xtradb/include/row0row.h b/storage/xtradb/include/row0row.h index 723b7b53395..36fb26482ce 100644 --- a/storage/xtradb/include/row0row.h +++ b/storage/xtradb/include/row0row.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -38,16 +38,16 @@ Created 4/20/1996 Heikki Tuuri #include "btr0types.h" /*********************************************************************//** -Gets the offset of the trx id field, in bytes relative to the origin of +Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of a clustered index record. 
@return offset of DATA_TRX_ID */ -UNIV_INTERN +UNIV_INLINE ulint row_get_trx_id_offset( /*==================*/ - const rec_t* rec, /*!< in: record */ - dict_index_t* index, /*!< in: clustered index */ - const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: record offsets */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Reads the trx id field from a clustered index record. @return value of the field */ @@ -55,9 +55,10 @@ UNIV_INLINE trx_id_t row_get_rec_trx_id( /*===============*/ - const rec_t* rec, /*!< in: record */ - dict_index_t* index, /*!< in: clustered index */ - const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Reads the roll pointer field from a clustered index record. @return value of the field */ @@ -65,9 +66,10 @@ UNIV_INLINE roll_ptr_t row_get_rec_roll_ptr( /*=================*/ - const rec_t* rec, /*!< in: record */ - dict_index_t* index, /*!< in: clustered index */ - const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ + __attribute__((nonnull, warn_unused_result)); /*****************************************************************//** When an insert or purge to a table is performed, this function builds the entry to be inserted into or purged from an index on the table. 
diff --git a/storage/xtradb/include/row0row.ic b/storage/xtradb/include/row0row.ic index 05c007641af..0b9ca982af8 100644 --- a/storage/xtradb/include/row0row.ic +++ b/storage/xtradb/include/row0row.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -28,15 +28,42 @@ Created 4/20/1996 Heikki Tuuri #include "trx0undo.h" /*********************************************************************//** +Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of +a clustered index record. +@return offset of DATA_TRX_ID */ +UNIV_INLINE +ulint +row_get_trx_id_offset( +/*==================*/ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: record offsets */ +{ + ulint pos; + ulint offset; + ulint len; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(NULL, index, offsets)); + + pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + + offset = rec_get_nth_field_offs(offsets, pos, &len); + + ut_ad(len == DATA_TRX_ID_LEN); + + return(offset); +} + +/*********************************************************************//** Reads the trx id field from a clustered index record. 
@return value of the field */ UNIV_INLINE trx_id_t row_get_rec_trx_id( /*===============*/ - const rec_t* rec, /*!< in: record */ - dict_index_t* index, /*!< in: clustered index */ - const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ { ulint offset; @@ -46,7 +73,7 @@ row_get_rec_trx_id( offset = index->trx_id_offset; if (!offset) { - offset = row_get_trx_id_offset(rec, index, offsets); + offset = row_get_trx_id_offset(index, offsets); } return(trx_read_trx_id(rec + offset)); @@ -59,9 +86,9 @@ UNIV_INLINE roll_ptr_t row_get_rec_roll_ptr( /*=================*/ - const rec_t* rec, /*!< in: record */ - dict_index_t* index, /*!< in: clustered index */ - const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ { ulint offset; @@ -71,7 +98,7 @@ row_get_rec_roll_ptr( offset = index->trx_id_offset; if (!offset) { - offset = row_get_trx_id_offset(rec, index, offsets); + offset = row_get_trx_id_offset(index, offsets); } return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN)); diff --git a/storage/xtradb/include/row0upd.ic b/storage/xtradb/include/row0upd.ic index 18e22f1eca9..0894ed373b0 100644 --- a/storage/xtradb/include/row0upd.ic +++ b/storage/xtradb/include/row0upd.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -171,7 +171,7 @@ row_upd_rec_sys_fields( ulint offset = index->trx_id_offset; if (!offset) { - offset = row_get_trx_id_offset(rec, index, offsets); + offset = row_get_trx_id_offset(index, offsets); } #if DATA_TRX_ID + 1 != DATA_ROLL_PTR diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h index d4329d16a62..c120db5cbcc 100644 --- a/storage/xtradb/include/srv0srv.h +++ b/storage/xtradb/include/srv0srv.h @@ -159,13 +159,10 @@ extern ulint srv_buf_pool_curr_size; /*!< current size in bytes */ extern ulint srv_mem_pool_size; extern ulint srv_lock_table_size; -extern uint srv_buffer_pool_shm_key; -extern ibool srv_buffer_pool_shm_is_reused; -extern ibool srv_buffer_pool_shm_checksum; - extern ibool srv_thread_concurrency_timer_based; extern ulint srv_n_file_io_threads; +extern my_bool srv_random_read_ahead; extern ulong srv_read_ahead_threshold; extern ulint srv_n_read_io_threads; extern ulint srv_n_write_io_threads; @@ -288,6 +285,7 @@ extern ibool srv_print_latch_waits; extern ulint srv_activity_count; extern ulint srv_fatal_semaphore_wait_threshold; extern ulint srv_dml_needed_delay; +extern lint srv_kill_idle_transaction; extern mutex_t* kernel_mutex_temp;/* mutex protecting the server, trx structs, query threads, and lock table: we allocate @@ -351,6 +349,9 @@ extern ulint srv_buf_pool_reads; /** Time in seconds between automatic buffer pool dumps */ extern uint srv_auto_lru_dump; +/** Whether startup should be blocked until buffer pool is fully restored */ +extern ibool srv_blocking_lru_restore; + /** Status variables to be passed to MySQL */ typedef struct export_var_struct export_struc; @@ -694,6 +695,7 @@ struct export_var_struct{ ulint innodb_buffer_pool_wait_free; /*!< srv_buf_pool_wait_free */ ulint innodb_buffer_pool_pages_flushed; /*!< srv_buf_pool_flushed */ ulint 
innodb_buffer_pool_write_requests;/*!< srv_buf_pool_write_requests */ + ulint innodb_buffer_pool_read_ahead_rnd;/*!< srv_read_ahead_rnd */ ulint innodb_buffer_pool_read_ahead; /*!< srv_read_ahead */ ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/ ulint innodb_deadlocks; /* ??? */ diff --git a/storage/xtradb/include/sync0arr.h b/storage/xtradb/include/sync0arr.h index 5f1280f5e28..6e931346238 100644 --- a/storage/xtradb/include/sync0arr.h +++ b/storage/xtradb/include/sync0arr.h @@ -115,8 +115,11 @@ Prints warnings of long semaphore waits to stderr. @return TRUE if fatal semaphore wait threshold was exceeded */ UNIV_INTERN ibool -sync_array_print_long_waits(void); -/*=============================*/ +sync_array_print_long_waits( +/*========================*/ + os_thread_id_t* waiter, /*!< out: longest waiting thread */ + const void** sema) /*!< out: longest-waited-for semaphore */ + __attribute__((nonnull)); /********************************************************************//** Validates the integrity of the wait array. Checks that the number of reserved cells equals the count variable. */ diff --git a/storage/xtradb/include/sync0rw.ic b/storage/xtradb/include/sync0rw.ic index 7116f1b7c9b..485a63a1b18 100644 --- a/storage/xtradb/include/sync0rw.ic +++ b/storage/xtradb/include/sync0rw.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. 
Portions of this file contain modifications contributed and copyrighted by @@ -603,16 +603,16 @@ rw_lock_x_unlock_direct( ut_ad((lock->lock_word % X_LOCK_DECR) == 0); -#ifdef UNIV_SYNC_DEBUG - rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX); -#endif - if (lock->lock_word == 0) { lock->recursive = FALSE; UNIV_MEM_INVALID(&lock->writer_thread, sizeof lock->writer_thread); } +#ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX); +#endif + lock->lock_word += X_LOCK_DECR; ut_ad(!lock->waiters); diff --git a/storage/xtradb/include/sync0sync.h b/storage/xtradb/include/sync0sync.h index f2ff83101ab..8b9a075e875 100644 --- a/storage/xtradb/include/sync0sync.h +++ b/storage/xtradb/include/sync0sync.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -219,8 +219,10 @@ void sync_thread_add_level( /*==================*/ void* latch, /*!< in: pointer to a mutex or an rw-lock */ - ulint level); /*!< in: level in the latching order; if + ulint level, /*!< in: level in the latching order; if SYNC_LEVEL_VARYING, nothing is done */ + ibool relock) /*!< in: TRUE if re-entering an x-lock */ + __attribute__((nonnull)); /******************************************************************//** Removes a latch from the thread level array if it is found there. @return TRUE if found in the array; it is no error if the latch is diff --git a/storage/xtradb/include/trx0sys.h b/storage/xtradb/include/trx0sys.h index 2637189f37e..eafa1ab6409 100644 --- a/storage/xtradb/include/trx0sys.h +++ b/storage/xtradb/include/trx0sys.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. 
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -317,6 +317,17 @@ ibool trx_in_trx_list( /*============*/ trx_t* in_trx);/*!< in: trx */ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +/***********************************************************//** +Assert that a transaction has been recovered. +@return TRUE */ +UNIV_INLINE +ibool +trx_assert_recovered( +/*=================*/ + trx_id_t trx_id) /*!< in: transaction identifier */ + __attribute__((warn_unused_result)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ /*****************************************************************//** Updates the offset information about the end of the MySQL binlog entry which corresponds to the transaction just being committed. In a MySQL diff --git a/storage/xtradb/include/trx0sys.ic b/storage/xtradb/include/trx0sys.ic index 5e0f07c8b9d..234fc0b92e9 100644 --- a/storage/xtradb/include/trx0sys.ic +++ b/storage/xtradb/include/trx0sys.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -311,6 +311,28 @@ trx_get_on_id( return(NULL); } +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +/***********************************************************//** +Assert that a transaction has been recovered. 
+@return TRUE */ +UNIV_INLINE +ibool +trx_assert_recovered( +/*=================*/ + trx_id_t trx_id) /*!< in: transaction identifier */ +{ + trx_t* trx; + + mutex_enter(&kernel_mutex); + trx = trx_get_on_id(trx_id); + ut_a(trx); + ut_a(trx->is_recovered); + mutex_exit(&kernel_mutex); + + return(TRUE); +} +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + /****************************************************************//** Returns the minumum trx id in trx list. This is the smallest id for which the trx can possibly be active. (But, you must look at the trx->conc_state to diff --git a/storage/xtradb/include/trx0trx.h b/storage/xtradb/include/trx0trx.h index 173c63918d3..700476bcd85 100644 --- a/storage/xtradb/include/trx0trx.h +++ b/storage/xtradb/include/trx0trx.h @@ -44,6 +44,9 @@ extern sess_t* trx_dummy_sess; /** Number of transactions currently allocated for MySQL: protected by the kernel mutex */ extern ulint trx_n_mysql_transactions; +/** Number of transactions currently in the XA PREPARED state: protected by +the kernel mutex */ +extern ulint trx_n_prepared; /********************************************************************//** Releases the search latch if trx has reserved it. */ @@ -108,6 +111,14 @@ trx_free( /*=====*/ trx_t* trx); /*!< in, own: trx object */ /********************************************************************//** +At shutdown, frees a transaction object that is in the PREPARED state. */ +UNIV_INTERN +void +trx_free_prepared( +/*==============*/ + trx_t* trx) /*!< in, own: trx object */ + __attribute__((nonnull)); +/********************************************************************//** Frees a transaction object for MySQL. */ UNIV_INTERN void @@ -498,6 +509,7 @@ struct trx_struct{ 150 bytes in the undo log size as then we skip XA steps */ ulint flush_log_at_trx_commit_session; + ulint fake_changes; ulint flush_log_later;/* In 2PC, we hold the prepare_commit mutex across both phases. 
In that case, we @@ -589,6 +601,8 @@ struct trx_struct{ ulint mysql_process_no;/* since in Linux, 'top' reports process id's and not thread id's, we store the process number too */ + time_t idle_start; + ib_int64_t last_stmt_start; /*------------------------------*/ ulint n_mysql_tables_in_use; /* number of Innobase tables used in the processing of the current diff --git a/storage/xtradb/include/trx0undo.h b/storage/xtradb/include/trx0undo.h index a084f2394b5..4f15cd85833 100644 --- a/storage/xtradb/include/trx0undo.h +++ b/storage/xtradb/include/trx0undo.h @@ -298,6 +298,15 @@ void trx_undo_insert_cleanup( /*====================*/ trx_t* trx); /*!< in: transaction handle */ + +/********************************************************************//** +At shutdown, frees the undo logs of a PREPARED transaction. */ +UNIV_INTERN +void +trx_undo_free_prepared( +/*===================*/ + trx_t* trx) /*!< in/out: PREPARED transaction */ + __attribute__((nonnull)); #endif /* !UNIV_HOTBACKUP */ /***********************************************************//** Parses the redo log entry of an undo log page initialization. diff --git a/storage/xtradb/include/univ.i b/storage/xtradb/include/univ.i index 7b11a16dae9..902d0c94ddd 100644 --- a/storage/xtradb/include/univ.i +++ b/storage/xtradb/include/univ.i @@ -1,8 +1,7 @@ /***************************************************************************** -Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. -Copyright (c) 2009, Sun Microsystems, Inc. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -46,8 +45,8 @@ Created 1/20/1994 Heikki Tuuri #define INNODB_VERSION_MAJOR 1 #define INNODB_VERSION_MINOR 0 -#define INNODB_VERSION_BUGFIX 15 -#define PERCONA_INNODB_VERSION 12.7 +#define INNODB_VERSION_BUGFIX 17 +#define PERCONA_INNODB_VERSION 13.0 /* The following is the InnoDB version as shown in SELECT plugin_version FROM information_schema.plugins; @@ -197,6 +196,8 @@ this will break redo log file compatibility, but it may be useful when debugging redo log application problems. */ #define UNIV_MEM_DEBUG /* detect memory leaks etc */ #define UNIV_IBUF_DEBUG /* debug the insert buffer */ +#define UNIV_BLOB_DEBUG /* track BLOB ownership; +assumes that no BLOBs survive server restart */ #define UNIV_IBUF_COUNT_DEBUG /* debug the insert buffer; this limits the database to IBUF_COUNT_N_SPACES and IBUF_COUNT_N_PAGES, and the insert buffer must be empty when the database is started */ diff --git a/storage/xtradb/include/ut0lst.h b/storage/xtradb/include/ut0lst.h index 245dfc226c3..7b15c052978 100644 --- a/storage/xtradb/include/ut0lst.h +++ b/storage/xtradb/include/ut0lst.h @@ -257,48 +257,5 @@ do { \ ut_a(ut_list_node_313 == NULL); \ } while (0) -/********************************************************************//** -Align nodes with moving location. 
-@param NAME the name of the list -@param TYPE node type -@param BASE base node (not a pointer to it) -@param OFFSET offset moved */ -#define UT_LIST_OFFSET(NAME, TYPE, BASE, FADDR, FOFFSET, BOFFSET) \ -do { \ - ulint ut_list_i_313; \ - TYPE* ut_list_node_313; \ - \ - if ((BASE).start) \ - (BASE).start = (void*)((byte*)((BASE).start) \ - + (((void*)((BASE).start) > (void*)FADDR)?FOFFSET:BOFFSET));\ - if ((BASE).end) \ - (BASE).end = (void*)((byte*)((BASE).end) \ - + (((void*)((BASE).end) > (void*)FADDR)?FOFFSET:BOFFSET));\ - \ - ut_list_node_313 = (BASE).start; \ - \ - for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) { \ - ut_a(ut_list_node_313); \ - if ((ut_list_node_313->NAME).prev) \ - (ut_list_node_313->NAME).prev = (void*)((byte*)((ut_list_node_313->NAME).prev)\ - + (((void*)((ut_list_node_313->NAME).prev) > (void*)FADDR)?FOFFSET:BOFFSET));\ - if ((ut_list_node_313->NAME).next) \ - (ut_list_node_313->NAME).next = (void*)((byte*)((ut_list_node_313->NAME).next)\ - + (((void*)((ut_list_node_313->NAME).next)> (void*)FADDR)?FOFFSET:BOFFSET));\ - ut_list_node_313 = (ut_list_node_313->NAME).next; \ - } \ - \ - ut_a(ut_list_node_313 == NULL); \ - \ - ut_list_node_313 = (BASE).end; \ - \ - for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) { \ - ut_a(ut_list_node_313); \ - ut_list_node_313 = (ut_list_node_313->NAME).prev; \ - } \ - \ - ut_a(ut_list_node_313 == NULL); \ -} while (0) - #endif diff --git a/storage/xtradb/include/ut0mem.h b/storage/xtradb/include/ut0mem.h index f14606be966..9c6ee9049ec 100644 --- a/storage/xtradb/include/ut0mem.h +++ b/storage/xtradb/include/ut0mem.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -210,43 +210,6 @@ ut_strlcpy_rev( ulint size); /*!< in: size of destination buffer */ /**********************************************************************//** -Compute strlen(ut_strcpyq(str, q)). -@return length of the string when quoted */ -UNIV_INLINE -ulint -ut_strlenq( -/*=======*/ - const char* str, /*!< in: null-terminated string */ - char q); /*!< in: the quote character */ - -/**********************************************************************//** -Make a quoted copy of a NUL-terminated string. Leading and trailing -quotes will not be included; only embedded quotes will be escaped. -See also ut_strlenq() and ut_memcpyq(). -@return pointer to end of dest */ -UNIV_INTERN -char* -ut_strcpyq( -/*=======*/ - char* dest, /*!< in: output buffer */ - char q, /*!< in: the quote character */ - const char* src); /*!< in: null-terminated string */ - -/**********************************************************************//** -Make a quoted copy of a fixed-length string. Leading and trailing -quotes will not be included; only embedded quotes will be escaped. -See also ut_strlenq() and ut_strcpyq(). -@return pointer to end of dest */ -UNIV_INTERN -char* -ut_memcpyq( -/*=======*/ - char* dest, /*!< in: output buffer */ - char q, /*!< in: the quote character */ - const char* src, /*!< in: string to be quoted */ - ulint len); /*!< in: length of src */ - -/**********************************************************************//** Return the number of times s2 occurs in s1. Overlapping instances of s2 are only counted once. 
@return the number of times s2 occurs in s1 */ diff --git a/storage/xtradb/include/ut0mem.ic b/storage/xtradb/include/ut0mem.ic index f36c28f1989..c06e2b3ae81 100644 --- a/storage/xtradb/include/ut0mem.ic +++ b/storage/xtradb/include/ut0mem.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -99,27 +99,6 @@ ut_strcmp(const char* str1, const char* str2) } /**********************************************************************//** -Compute strlen(ut_strcpyq(str, q)). -@return length of the string when quoted */ -UNIV_INLINE -ulint -ut_strlenq( -/*=======*/ - const char* str, /*!< in: null-terminated string */ - char q) /*!< in: the quote character */ -{ - ulint len; - - for (len = 0; *str; len++, str++) { - if (*str == q) { - len++; - } - } - - return(len); -} - -/**********************************************************************//** Converts a raw binary data to a NUL-terminated hex string. The output is truncated if there is not enough space in "hex", make sure "hex_size" is at least (2 * raw_size + 1) if you do not want this to happen. 
Returns the diff --git a/storage/xtradb/lock/lock0lock.c b/storage/xtradb/lock/lock0lock.c index 4fcb5b2c522..e5da4f46ec9 100644 --- a/storage/xtradb/lock/lock0lock.c +++ b/storage/xtradb/lock/lock0lock.c @@ -3908,6 +3908,10 @@ lock_table( trx = thr_get_trx(thr); + if (trx->fake_changes && mode == LOCK_IX) { + mode = LOCK_IS; + } + lock_mutex_enter_kernel(); /* Look for stronger locks the same trx already has on the table */ @@ -5109,6 +5113,11 @@ lock_rec_insert_check_and_lock( } trx = thr_get_trx(thr); + + if (trx->fake_changes) { + return(DB_SUCCESS); + } + next_rec = page_rec_get_next_const(rec); next_rec_heap_no = page_rec_get_heap_no(next_rec); @@ -5277,6 +5286,10 @@ lock_clust_rec_modify_check_and_lock( return(DB_SUCCESS); } + if (thr && thr_get_trx(thr)->fake_changes) { + return(DB_SUCCESS); + } + heap_no = rec_offs_comp(offsets) ? rec_get_heap_no_new(rec) : rec_get_heap_no_old(rec); @@ -5335,6 +5348,10 @@ lock_sec_rec_modify_check_and_lock( return(DB_SUCCESS); } + if (thr && thr_get_trx(thr)->fake_changes) { + return(DB_SUCCESS); + } + heap_no = page_rec_get_heap_no(rec); /* Another transaction cannot have an implicit lock on the record, @@ -5422,6 +5439,10 @@ lock_sec_rec_read_check_and_lock( return(DB_SUCCESS); } + if (thr && thr_get_trx(thr)->fake_changes && mode == LOCK_X) { + mode = LOCK_S; + } + heap_no = page_rec_get_heap_no(rec); lock_mutex_enter_kernel(); @@ -5499,6 +5520,10 @@ lock_clust_rec_read_check_and_lock( return(DB_SUCCESS); } + if (thr && thr_get_trx(thr)->fake_changes && mode == LOCK_X) { + mode = LOCK_S; + } + heap_no = page_rec_get_heap_no(rec); lock_mutex_enter_kernel(); diff --git a/storage/xtradb/log/log0log.c b/storage/xtradb/log/log0log.c index c39f60bd4b9..fc8745629d6 100644 --- a/storage/xtradb/log/log0log.c +++ b/storage/xtradb/log/log0log.c @@ -1114,7 +1114,7 @@ log_io_complete( && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) { - fil_flush(group->space_id); + 
fil_flush(group->space_id, FALSE); } #ifdef UNIV_DEBUG @@ -1137,7 +1137,7 @@ log_io_complete( && srv_unix_file_flush_method != SRV_UNIX_NOSYNC && srv_flush_log_at_trx_commit != 2) { - fil_flush(group->space_id); + fil_flush(group->space_id, FALSE); } mutex_enter(&(log_sys->mutex)); @@ -1528,7 +1528,7 @@ loop: group = UT_LIST_GET_FIRST(log_sys->log_groups); - fil_flush(group->space_id); + fil_flush(group->space_id, FALSE); log_sys->flushed_to_disk_lsn = log_sys->write_lsn; } @@ -2623,7 +2623,7 @@ log_io_complete_archive(void) mutex_exit(&(log_sys->mutex)); - fil_flush(group->archive_space_id); + fil_flush(group->archive_space_id, TRUE); mutex_enter(&(log_sys->mutex)); @@ -3122,12 +3122,13 @@ loop: goto loop; } - /* Check that there are no longer transactions. We need this wait even - for the 'very fast' shutdown, because the InnoDB layer may have - committed or prepared transactions and we don't want to lose them. */ + /* Check that there are no longer transactions, except for + PREPARED ones. We need this wait even for the 'very fast' + shutdown, because the InnoDB layer may have committed or + prepared transactions and we don't want to lose them. 
*/ if (trx_n_mysql_transactions > 0 - || UT_LIST_GET_LEN(trx_sys->trx_list) > 0) { + || UT_LIST_GET_LEN(trx_sys->trx_list) > trx_n_prepared) { mutex_exit(&kernel_mutex); diff --git a/storage/xtradb/log/log0recv.c b/storage/xtradb/log/log0recv.c index 895067c700a..fae7fbd0da0 100644 --- a/storage/xtradb/log/log0recv.c +++ b/storage/xtradb/log/log0recv.c @@ -2899,7 +2899,6 @@ recv_init_crash_recovery(void) /*==========================*/ { ut_a(!recv_needed_recovery); - ut_a(!srv_buffer_pool_shm_is_reused); recv_needed_recovery = TRUE; @@ -3622,7 +3621,7 @@ recv_reset_log_files_for_backup( exit(1); } - os_file_flush(log_file); + os_file_flush(log_file, TRUE); os_file_close(log_file); } @@ -3645,7 +3644,7 @@ recv_reset_log_files_for_backup( os_file_write(name, log_file, buf, 0, 0, LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE); - os_file_flush(log_file); + os_file_flush(log_file, TRUE); os_file_close(log_file); ut_free(buf); diff --git a/storage/xtradb/mtr/mtr0mtr.c b/storage/xtradb/mtr/mtr0mtr.c index 34e6d3ffc92..a9f1c35f84c 100644 --- a/storage/xtradb/mtr/mtr0mtr.c +++ b/storage/xtradb/mtr/mtr0mtr.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -248,40 +248,6 @@ mtr_commit( } #ifndef UNIV_HOTBACKUP -/**********************************************************//** -Releases the latches stored in an mtr memo down to a savepoint. -NOTE! The mtr must not have made changes to buffer pages after the -savepoint, as these can be handled only by mtr_commit. 
*/ -UNIV_INTERN -void -mtr_rollback_to_savepoint( -/*======================*/ - mtr_t* mtr, /*!< in: mtr */ - ulint savepoint) /*!< in: savepoint */ -{ - mtr_memo_slot_t* slot; - dyn_array_t* memo; - ulint offset; - - ut_ad(mtr); - ut_ad(mtr->magic_n == MTR_MAGIC_N); - ut_ad(mtr->state == MTR_ACTIVE); - - memo = &(mtr->memo); - - offset = dyn_array_get_data_size(memo); - ut_ad(offset >= savepoint); - - while (offset > savepoint) { - offset -= sizeof(mtr_memo_slot_t); - - slot = dyn_array_get_element(memo, offset); - - ut_ad(slot->type != MTR_MEMO_MODIFY); - mtr_memo_slot_release(mtr, slot); - } -} - /***************************************************//** Releases an object in the memo stack. */ UNIV_INTERN diff --git a/storage/xtradb/os/os0file.c b/storage/xtradb/os/os0file.c index 716777227eb..1e863109bed 100644 --- a/storage/xtradb/os/os0file.c +++ b/storage/xtradb/os/os0file.c @@ -1912,7 +1912,7 @@ os_file_set_size( ut_free(buf2); - ret = os_file_flush(file); + ret = os_file_flush(file, TRUE); if (ret) { return(TRUE); @@ -1950,7 +1950,8 @@ static int os_file_fsync( /*==========*/ - os_file_t file) /*!< in: handle to a file */ + os_file_t file, /*!< in: handle to a file */ + ibool metadata) { int ret; int failures; @@ -1959,7 +1960,16 @@ os_file_fsync( failures = 0; do { +#if defined(HAVE_FDATASYNC) && HAVE_DECL_FDATASYNC + if (metadata) { + ret = fsync(file); + } else { + ret = fdatasync(file); + } +#else + (void) metadata; ret = fsync(file); +#endif os_n_fsyncs++; @@ -1978,6 +1988,9 @@ os_file_fsync( failures++; retry = TRUE; + } else if (ret == -1 && errno == EINTR) { + /* Handle signal interruptions correctly */ + retry = TRUE; } else { retry = FALSE; @@ -1995,7 +2008,8 @@ UNIV_INTERN ibool os_file_flush( /*==========*/ - os_file_t file) /*!< in, own: handle to a file */ + os_file_t file, /*!< in, own: handle to a file */ + ibool metadata) { #ifdef __WIN__ BOOL ret; @@ -2045,18 +2059,18 @@ os_file_flush( /* If we are not on an operating system that supports 
this, then fall back to a plain fsync. */ - ret = os_file_fsync(file); + ret = os_file_fsync(file, metadata); } else { ret = fcntl(file, F_FULLFSYNC, NULL); if (ret) { /* If we are not on a file system that supports this, then fall back to a plain fsync. */ - ret = os_file_fsync(file); + ret = os_file_fsync(file, metadata); } } #else - ret = os_file_fsync(file); + ret = os_file_fsync(file, metadata); #endif if (ret == 0) { @@ -2109,6 +2123,7 @@ _os_file_pread( off_t offs; #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD) ssize_t n_bytes; + ssize_t n_read; #endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */ ulint sec; ulint ms; @@ -2149,7 +2164,18 @@ _os_file_pread( os_n_pending_reads++; os_mutex_exit(os_file_count_mutex); - n_bytes = pread(file, buf, (ssize_t)n, offs); + /* Handle signal interruptions correctly */ + for (n_bytes = 0; n_bytes < (ssize_t) n; ) { + n_read = pread(file, buf, (ssize_t)n, offs); + if (n_read > 0) { + n_bytes += n_read; + offs += n_read; + } else if (n_read == -1 && errno == EINTR) { + continue; + } else { + break; + } + } os_mutex_enter(os_file_count_mutex); os_file_n_pending_preads--; @@ -2168,6 +2194,7 @@ _os_file_pread( { off_t ret_offset; ssize_t ret; + ssize_t n_read; #ifndef UNIV_HOTBACKUP ulint i; #endif /* !UNIV_HOTBACKUP */ @@ -2188,7 +2215,17 @@ _os_file_pread( if (ret_offset < 0) { ret = -1; } else { - ret = read(file, buf, (ssize_t)n); + /* Handle signal interruptions correctly */ + for (ret = 0; ret < (ssize_t) n; ) { + n_read = read(file, buf, (ssize_t)n); + if (n_read > 0) { + ret += n_read; + } else if (n_read == -1 && errno == EINTR) { + continue; + } else { + break; + } + } } #ifndef UNIV_HOTBACKUP @@ -2227,6 +2264,7 @@ os_file_pwrite( offset */ { ssize_t ret; + ssize_t n_written; off_t offs; ut_a((offset & 0xFFFFFFFFUL) == offset); @@ -2254,7 +2292,18 @@ os_file_pwrite( os_n_pending_writes++; os_mutex_exit(os_file_count_mutex); - ret = pwrite(file, buf, (ssize_t)n, offs); + /* Handle signal interruptions correctly */ + 
for (ret = 0; ret < (ssize_t) n; ) { + n_written = pwrite(file, buf, (ssize_t)n, offs); + if (n_written > 0) { + ret += n_written; + offs += n_written; + } else if (n_written == -1 && errno == EINTR) { + continue; + } else { + break; + } + } os_mutex_enter(os_file_count_mutex); os_file_n_pending_pwrites--; @@ -2270,7 +2319,7 @@ os_file_pwrite( the OS crashes, a database page is only partially physically written to disk. */ - ut_a(TRUE == os_file_flush(file)); + ut_a(TRUE == os_file_flush(file, TRUE)); } # endif /* UNIV_DO_FLUSH */ @@ -2301,7 +2350,17 @@ os_file_pwrite( goto func_exit; } - ret = write(file, buf, (ssize_t)n); + /* Handle signal interruptions correctly */ + for (ret = 0; ret < (ssize_t) n; ) { + n_written = write(file, buf, (ssize_t)n); + if (n_written > 0) { + ret += n_written; + } else if (n_written == -1 && errno == EINTR) { + continue; + } else { + break; + } + } # ifdef UNIV_DO_FLUSH if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC @@ -2312,7 +2371,7 @@ os_file_pwrite( the OS crashes, a database page is only partially physically written to disk. 
*/ - ut_a(TRUE == os_file_flush(file)); + ut_a(TRUE == os_file_flush(file, TRUE)); } # endif /* UNIV_DO_FLUSH */ @@ -2679,7 +2738,7 @@ retry: # ifdef UNIV_DO_FLUSH if (!os_do_not_call_flush_at_each_write) { - ut_a(TRUE == os_file_flush(file)); + ut_a(TRUE == os_file_flush(file, TRUE)); } # endif /* UNIV_DO_FLUSH */ @@ -3940,7 +3999,7 @@ os_aio_windows_handle( #ifdef UNIV_DO_FLUSH if (slot->type == OS_FILE_WRITE && !os_do_not_call_flush_at_each_write) { - ut_a(TRUE == os_file_flush(slot->file)); + ut_a(TRUE == os_file_flush(slot->file, TRUE)); } #endif /* UNIV_DO_FLUSH */ } else if (os_file_handle_error(slot->name, "Windows aio")) { diff --git a/storage/xtradb/os/os0proc.c b/storage/xtradb/os/os0proc.c index 4567d96b6f4..48922886f23 100644 --- a/storage/xtradb/os/os0proc.c +++ b/storage/xtradb/os/os0proc.c @@ -229,173 +229,3 @@ os_mem_free_large( } #endif } - -/****************************************************************//** -Allocates or attaches and reuses shared memory segment. -The content is not cleared automatically. 
-@return allocated memory */ -UNIV_INTERN -void* -os_shm_alloc( -/*=========*/ - ulint* n, /*!< in/out: number of bytes */ - uint key, - ibool* is_new) -{ - void* ptr; -#if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H - ulint size; - int shmid; - - *is_new = FALSE; - fprintf(stderr, - "InnoDB: The shared memory segment containing the buffer pool is: key %#x (%d).\n", - key, key); -# if defined HAVE_LARGE_PAGES && defined UNIV_LINUX - if (!os_use_large_pages || !os_large_page_size) { - goto skip; - } - - /* Align block size to os_large_page_size */ - ut_ad(ut_is_2pow(os_large_page_size)); - size = ut_2pow_round(*n + (os_large_page_size - 1), - os_large_page_size); - - shmid = shmget((key_t)key, (size_t)size, - IPC_CREAT | IPC_EXCL | SHM_HUGETLB | SHM_R | SHM_W); - if (shmid < 0) { - if (errno == EEXIST) { - fprintf(stderr, - "InnoDB: HugeTLB: The shared memory segment exists.\n"); - shmid = shmget((key_t)key, (size_t)size, - SHM_HUGETLB | SHM_R | SHM_W); - if (shmid < 0) { - fprintf(stderr, - "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. (reuse) errno %d\n", - size, errno); - goto skip; - } else { - fprintf(stderr, - "InnoDB: HugeTLB: The existent shared memory segment is used.\n"); - } - } else { - fprintf(stderr, - "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. 
(new) errno %d\n", - size, errno); - goto skip; - } - } else { - *is_new = TRUE; - fprintf(stderr, - "InnoDB: HugeTLB: A new shared memory segment has been created .\n"); - } - - ptr = shmat(shmid, NULL, 0); - if (ptr == (void *)-1) { - fprintf(stderr, - "InnoDB: HugeTLB: Warning: Failed to attach shared memory segment, errno %d\n", - errno); - ptr = NULL; - } - - if (ptr) { - *n = size; - os_fast_mutex_lock(&ut_list_mutex); - ut_total_allocated_memory += size; - os_fast_mutex_unlock(&ut_list_mutex); - UNIV_MEM_ALLOC(ptr, size); - return(ptr); - } -skip: - *is_new = FALSE; -# endif /* HAVE_LARGE_PAGES && defined UNIV_LINUX */ -# ifdef HAVE_GETPAGESIZE - size = getpagesize(); -# else - size = UNIV_PAGE_SIZE; -# endif - /* Align block size to system page size */ - ut_ad(ut_is_2pow(size)); - size = *n = ut_2pow_round(*n + (size - 1), size); - - shmid = shmget((key_t)key, (size_t)size, - IPC_CREAT | IPC_EXCL | SHM_R | SHM_W); - if (shmid < 0) { - if (errno == EEXIST) { - fprintf(stderr, - "InnoDB: A shared memory segment containing the buffer pool seems to already exist.\n"); - shmid = shmget((key_t)key, (size_t)size, - SHM_R | SHM_W); - if (shmid < 0) { - fprintf(stderr, - "InnoDB: Warning: Failed to allocate %lu bytes. (reuse) errno %d\n", - size, errno); - ptr = NULL; - goto end; - } else { - fprintf(stderr, - "InnoDB: The existent shared memory segment is used.\n"); - } - } else { - fprintf(stderr, - "InnoDB: Warning: Failed to allocate %lu bytes. 
(new) errno %d\n", - size, errno); - ptr = NULL; - goto end; - } - } else { - *is_new = TRUE; - fprintf(stderr, - "InnoDB: A new shared memory segment has been created.\n"); - } - - ptr = shmat(shmid, NULL, 0); - if (ptr == (void *)-1) { - fprintf(stderr, - "InnoDB: Warning: Failed to attach shared memory segment, errno %d\n", - errno); - ptr = NULL; - } - - if (ptr) { - *n = size; - os_fast_mutex_lock(&ut_list_mutex); - ut_total_allocated_memory += size; - os_fast_mutex_unlock(&ut_list_mutex); - UNIV_MEM_ALLOC(ptr, size); - } -end: -#else /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */ - fprintf(stderr, "InnoDB: shared memory segment is not supported.\n"); - ptr = NULL; -#endif /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */ - return(ptr); -} - -/****************************************************************//** -Detach shared memory segment. */ -UNIV_INTERN -void -os_shm_free( -/*========*/ - void *ptr, /*!< in: pointer returned by - os_shm_alloc() */ - ulint size) /*!< in: size returned by - os_shm_alloc() */ -{ - os_fast_mutex_lock(&ut_list_mutex); - ut_a(ut_total_allocated_memory >= size); - os_fast_mutex_unlock(&ut_list_mutex); - -#if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H - if (!shmdt(ptr)) { - os_fast_mutex_lock(&ut_list_mutex); - ut_a(ut_total_allocated_memory >= size); - ut_total_allocated_memory -= size; - os_fast_mutex_unlock(&ut_list_mutex); - UNIV_MEM_FREE(ptr, size); - } -#else /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */ - fprintf(stderr, "InnoDB: shared memory segment is not supported.\n"); -#endif /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */ -} diff --git a/storage/xtradb/page/page0cur.c b/storage/xtradb/page/page0cur.c index f10f16a7dd9..b8c492328e8 100644 --- a/storage/xtradb/page/page0cur.c +++ b/storage/xtradb/page/page0cur.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1149,6 +1149,8 @@ use_heap: current_rec, index, mtr); } + btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert"); + return(insert_rec); } @@ -1178,14 +1180,15 @@ page_cur_insert_rec_zip_reorg( /* Before trying to reorganize the page, store the number of preceding records on the page. */ pos = page_rec_get_n_recs_before(rec); + ut_ad(pos > 0); if (page_zip_reorganize(block, index, mtr)) { /* The page was reorganized: Find rec by seeking to pos, and update *current_rec. */ - rec = page + PAGE_NEW_INFIMUM; - - while (--pos) { - rec = page + rec_get_next_offs(rec, TRUE); + if (pos > 1) { + rec = page_rec_get_nth(page, pos - 1); + } else { + rec = page + PAGE_NEW_INFIMUM; } *current_rec = rec; @@ -1195,10 +1198,12 @@ page_cur_insert_rec_zip_reorg( } /* Out of space: restore the page */ + btr_blob_dbg_remove(page, index, "insert_zip_fail"); if (!page_zip_decompress(page_zip, page, FALSE)) { ut_error; /* Memory corrupted? */ } ut_ad(page_validate(page, index)); + btr_blob_dbg_add(page, index, "insert_zip_fail"); return(NULL); } @@ -1279,6 +1284,12 @@ page_cur_insert_rec_zip( insert_rec = page_cur_insert_rec_zip_reorg( current_rec, block, index, insert_rec, page, page_zip, mtr); +#ifdef UNIV_DEBUG + if (insert_rec) { + rec_offs_make_valid( + insert_rec, index, offsets); + } +#endif /* UNIV_DEBUG */ } return(insert_rec); @@ -1490,6 +1501,8 @@ use_heap: page_zip_write_rec(page_zip, insert_rec, index, offsets, 1); + btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert_zip_ok"); + /* 9. 
Write log record of the insert */ if (UNIV_LIKELY(mtr != NULL)) { page_cur_insert_rec_write_log(insert_rec, rec_size, @@ -1697,6 +1710,9 @@ page_copy_rec_list_end_to_created_page( heap_top += rec_size; + rec_offs_make_valid(insert_rec, index, offsets); + btr_blob_dbg_add_rec(insert_rec, index, offsets, "copy_end"); + page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, index, mtr); prev_rec = insert_rec; @@ -1944,6 +1960,7 @@ page_cur_delete_rec( page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1); /* 6. Free the memory occupied by the record */ + btr_blob_dbg_remove_rec(current_rec, index, offsets, "delete"); page_mem_free(page, page_zip, current_rec, index, offsets); /* 7. Now we have decremented the number of owned records of the slot. diff --git a/storage/xtradb/page/page0page.c b/storage/xtradb/page/page0page.c index 10008f9ac25..a284b1480a3 100644 --- a/storage/xtradb/page/page0page.c +++ b/storage/xtradb/page/page0page.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -685,12 +685,16 @@ page_copy_rec_list_end( if (UNIV_UNLIKELY (!page_zip_reorganize(new_block, index, mtr))) { + btr_blob_dbg_remove(new_page, index, + "copy_end_reorg_fail"); if (UNIV_UNLIKELY (!page_zip_decompress(new_page_zip, new_page, FALSE))) { ut_error; } ut_ad(page_validate(new_page, index)); + btr_blob_dbg_add(new_page, index, + "copy_end_reorg_fail"); return(NULL); } else { /* The page was reorganized: @@ -803,12 +807,16 @@ page_copy_rec_list_start( if (UNIV_UNLIKELY (!page_zip_reorganize(new_block, index, mtr))) { + btr_blob_dbg_remove(new_page, index, + "copy_start_reorg_fail"); if (UNIV_UNLIKELY (!page_zip_decompress(new_page_zip, new_page, FALSE))) { ut_error; } ut_ad(page_validate(new_page, index)); + btr_blob_dbg_add(new_page, index, + "copy_start_reorg_fail"); return(NULL); } else { /* The page was reorganized: @@ -1080,6 +1088,9 @@ page_delete_rec_list_end( /* Remove the record chain segment from the record chain */ page_rec_set_next(prev_rec, page_get_supremum_rec(page)); + btr_blob_dbg_op(page, rec, index, "delete_end", + btr_blob_dbg_remove_rec); + /* Catenate the deleted chain segment to the page free list */ page_rec_set_next(last_rec, page_header_get_ptr(page, PAGE_FREE)); @@ -1476,55 +1487,54 @@ page_dir_balance_slot( } } -#ifndef UNIV_HOTBACKUP /************************************************************//** -Returns the middle record of the record list. If there are an even number -of records in the list, returns the first record of the upper half-list. -@return middle record */ +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). 
+@return nth record */ UNIV_INTERN -rec_t* -page_get_middle_rec( -/*================*/ - page_t* page) /*!< in: page */ +const rec_t* +page_rec_get_nth_const( +/*===================*/ + const page_t* page, /*!< in: page */ + ulint nth) /*!< in: nth record */ { - page_dir_slot_t* slot; - ulint middle; + const page_dir_slot_t* slot; ulint i; ulint n_owned; - ulint count; - rec_t* rec; + const rec_t* rec; - /* This many records we must leave behind */ - middle = (page_get_n_recs(page) + PAGE_HEAP_NO_USER_LOW) / 2; - - count = 0; + ut_ad(nth < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1)); for (i = 0;; i++) { slot = page_dir_get_nth_slot(page, i); n_owned = page_dir_slot_get_n_owned(slot); - if (count + n_owned > middle) { + if (n_owned > nth) { break; } else { - count += n_owned; + nth -= n_owned; } } ut_ad(i > 0); slot = page_dir_get_nth_slot(page, i - 1); - rec = (rec_t*) page_dir_slot_get_rec(slot); - rec = page_rec_get_next(rec); - - /* There are now count records behind rec */ + rec = page_dir_slot_get_rec(slot); - for (i = 0; i < middle - count; i++) { - rec = page_rec_get_next(rec); + if (page_is_comp(page)) { + do { + rec = page_rec_get_next_low(rec, TRUE); + ut_ad(rec); + } while (nth--); + } else { + do { + rec = page_rec_get_next_low(rec, FALSE); + ut_ad(rec); + } while (nth--); } return(rec); } -#endif /* !UNIV_HOTBACKUP */ /***************************************************************//** Returns the number of records before the given record in chain. @@ -1586,6 +1596,7 @@ page_rec_get_n_recs_before( n--; ut_ad(n >= 0); + ut_ad(n < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1)); return((ulint) n); } diff --git a/storage/xtradb/page/page0zip.c b/storage/xtradb/page/page0zip.c index 5b4f5d3b76a..3d5c5a226c7 100644 --- a/storage/xtradb/page/page0zip.c +++ b/storage/xtradb/page/page0zip.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved. 
+Copyright (c) 2005, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -151,6 +151,20 @@ page_zip_empty_size( #endif /* !UNIV_HOTBACKUP */ /*************************************************************//** +Gets the number of elements in the dense page directory, +including deleted records (the free list). +@return number of elements in the dense page directory */ +UNIV_INLINE +ulint +page_zip_dir_elems( +/*===============*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ +{ + /* Exclude the page infimum and supremum from the record count. */ + return(page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW); +} + +/*************************************************************//** Gets the size of the compressed page trailer (the dense page directory), including deleted records (the free list). @return length of dense page directory, in bytes */ @@ -160,14 +174,42 @@ page_zip_dir_size( /*==============*/ const page_zip_des_t* page_zip) /*!< in: compressed page */ { - /* Exclude the page infimum and supremum from the record count. */ - ulint size = PAGE_ZIP_DIR_SLOT_SIZE - * (page_dir_get_n_heap(page_zip->data) - - PAGE_HEAP_NO_USER_LOW); - return(size); + return(PAGE_ZIP_DIR_SLOT_SIZE * page_zip_dir_elems(page_zip)); +} + +/*************************************************************//** +Gets an offset to the compressed page trailer (the dense page directory), +including deleted records (the free list). 
+@return offset of the dense page directory */ +UNIV_INLINE +ulint +page_zip_dir_start_offs( +/*====================*/ + const page_zip_des_t* page_zip, /*!< in: compressed page */ + ulint n_dense) /*!< in: directory size */ +{ + ut_ad(n_dense * PAGE_ZIP_DIR_SLOT_SIZE < page_zip_get_size(page_zip)); + + return(page_zip_get_size(page_zip) - n_dense * PAGE_ZIP_DIR_SLOT_SIZE); } /*************************************************************//** +Gets a pointer to the compressed page trailer (the dense page directory), +including deleted records (the free list). +@param[in] page_zip compressed page +@param[in] n_dense number of entries in the directory +@return pointer to the dense page directory */ +#define page_zip_dir_start_low(page_zip, n_dense) \ + ((page_zip)->data + page_zip_dir_start_offs(page_zip, n_dense)) +/*************************************************************//** +Gets a pointer to the compressed page trailer (the dense page directory), +including deleted records (the free list). +@param[in] page_zip compressed page +@return pointer to the dense page directory */ +#define page_zip_dir_start(page_zip) \ + page_zip_dir_start_low(page_zip, page_zip_dir_elems(page_zip)) + +/*************************************************************//** Gets the size of the compressed page trailer (the dense page directory), only including user records (excluding the free list). @return length of dense page directory comprising existing records, in bytes */ @@ -653,13 +695,13 @@ page_zip_dir_encode( Allocate memory for zlib. 
*/ static void* -page_zip_malloc( +page_zip_zalloc( /*============*/ void* opaque, /*!< in/out: memory heap */ uInt items, /*!< in: number of items to allocate */ uInt size) /*!< in: size of an item in bytes */ { - return(mem_heap_alloc(opaque, items * size)); + return(mem_heap_zalloc(opaque, items * size)); } /**********************************************************************//** @@ -684,7 +726,7 @@ page_zip_set_alloc( { z_stream* strm = stream; - strm->zalloc = page_zip_malloc; + strm->zalloc = page_zip_zalloc; strm->zfree = page_zip_free; strm->opaque = heap; } @@ -2246,8 +2288,7 @@ zlib_done: } /* Restore the uncompressed columns in heap_no order. */ - storage = page_zip->data + page_zip_get_size(page_zip) - - n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + storage = page_zip_dir_start_low(page_zip, n_dense); for (slot = 0; slot < n_dense; slot++) { rec_t* rec = recs[slot]; @@ -2732,8 +2773,7 @@ zlib_done: return(FALSE); } - storage = page_zip->data + page_zip_get_size(page_zip) - - n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + storage = page_zip_dir_start_low(page_zip, n_dense); externs = storage - n_dense * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); @@ -2916,19 +2956,18 @@ zlib_error: page_zip_set_alloc(&d_stream, heap); - if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT) - != Z_OK)) { - ut_error; - } - d_stream.next_in = page_zip->data + PAGE_DATA; /* Subtract the space reserved for the page header and the end marker of the modification log. */ d_stream.avail_in = page_zip_get_size(page_zip) - (PAGE_DATA + 1); - d_stream.next_out = page + PAGE_ZIP_START; d_stream.avail_out = UNIV_PAGE_SIZE - PAGE_ZIP_START; + if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT) + != Z_OK)) { + ut_error; + } + /* Decode the zlib header and the index information. */ if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) { @@ -3462,9 +3501,7 @@ page_zip_write_rec( } /* Write the data bytes. Store the uncompressed bytes separately. 
*/ - storage = page_zip->data + page_zip_get_size(page_zip) - - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) - * PAGE_ZIP_DIR_SLOT_SIZE; + storage = page_zip_dir_start(page_zip); if (page_is_leaf(page)) { ulint len; @@ -3760,9 +3797,7 @@ corrupt: field = page + offset; storage = page_zip->data + z_offset; - storage_end = page_zip->data + page_zip_get_size(page_zip) - - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) - * PAGE_ZIP_DIR_SLOT_SIZE; + storage_end = page_zip_dir_start(page_zip); heap_no = 1 + (storage_end - storage) / REC_NODE_PTR_SIZE; @@ -3798,7 +3833,9 @@ page_zip_write_node_ptr( { byte* field; byte* storage; +#ifdef UNIV_DEBUG page_t* page = page_align(rec); +#endif /* UNIV_DEBUG */ ut_ad(PAGE_ZIP_MATCH(rec, page_zip)); ut_ad(page_simple_validate_new(page)); @@ -3815,9 +3852,7 @@ page_zip_write_node_ptr( UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); UNIV_MEM_ASSERT_RW(rec, size); - storage = page_zip->data + page_zip_get_size(page_zip) - - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) - * PAGE_ZIP_DIR_SLOT_SIZE + storage = page_zip_dir_start(page_zip) - (rec_get_heap_no_new(rec) - 1) * REC_NODE_PTR_SIZE; field = rec + size - REC_NODE_PTR_SIZE; @@ -3866,7 +3901,9 @@ page_zip_write_trx_id_and_roll_ptr( { byte* field; byte* storage; +#ifdef UNIV_DEBUG page_t* page = page_align(rec); +#endif /* UNIV_DEBUG */ ulint len; ut_ad(PAGE_ZIP_MATCH(rec, page_zip)); @@ -3884,9 +3921,7 @@ page_zip_write_trx_id_and_roll_ptr( UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); - storage = page_zip->data + page_zip_get_size(page_zip) - - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) - * PAGE_ZIP_DIR_SLOT_SIZE + storage = page_zip_dir_start(page_zip) - (rec_get_heap_no_new(rec) - 1) * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); @@ -3917,17 +3952,9 @@ page_zip_write_trx_id_and_roll_ptr( UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); } -#ifdef UNIV_ZIP_DEBUG -/** Set this variable in a debugger to disable 
page_zip_clear_rec(). -The only observable effect should be the compression ratio due to -deleted records not being zeroed out. In rare cases, there can be -page_zip_validate() failures on the node_ptr, trx_id and roll_ptr -columns if the space is reallocated for a smaller record. */ -UNIV_INTERN ibool page_zip_clear_rec_disable; -#endif /* UNIV_ZIP_DEBUG */ - /**********************************************************************//** -Clear an area on the uncompressed and compressed page, if possible. */ +Clear an area on the uncompressed and compressed page. +Do not clear the data payload, as that would grow the modification log. */ static void page_zip_clear_rec( @@ -3939,6 +3966,9 @@ page_zip_clear_rec( { ulint heap_no; page_t* page = page_align(rec); + byte* storage; + byte* field; + ulint len; /* page_zip_validate() would fail here if a record containing externally stored columns is being deleted. */ ut_ad(rec_offs_validate(rec, index, offsets)); @@ -3954,60 +3984,38 @@ page_zip_clear_rec( UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), rec_offs_extra_size(offsets)); - if ( -#ifdef UNIV_ZIP_DEBUG - !page_zip_clear_rec_disable && -#endif /* UNIV_ZIP_DEBUG */ - page_zip->m_end - + 1 + ((heap_no - 1) >= 64)/* size of the log entry */ - + page_zip_get_trailer_len(page_zip, - dict_index_is_clust(index), NULL) - < page_zip_get_size(page_zip)) { - byte* data; - - /* Clear only the data bytes, because the allocator and - the decompressor depend on the extra bytes. */ - memset(rec, 0, rec_offs_data_size(offsets)); - - if (!page_is_leaf(page)) { - /* Clear node_ptr on the compressed page. */ - byte* storage = page_zip->data - + page_zip_get_size(page_zip) - - (page_dir_get_n_heap(page) - - PAGE_HEAP_NO_USER_LOW) - * PAGE_ZIP_DIR_SLOT_SIZE; - - memset(storage - (heap_no - 1) * REC_NODE_PTR_SIZE, - 0, REC_NODE_PTR_SIZE); - } else if (dict_index_is_clust(index)) { - /* Clear trx_id and roll_ptr on the compressed page. 
*/ - byte* storage = page_zip->data - + page_zip_get_size(page_zip) - - (page_dir_get_n_heap(page) - - PAGE_HEAP_NO_USER_LOW) - * PAGE_ZIP_DIR_SLOT_SIZE; - - memset(storage - (heap_no - 1) - * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN), - 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); - } + if (!page_is_leaf(page)) { + /* Clear node_ptr. On the compressed page, + there is an array of node_ptr immediately before the + dense page directory, at the very end of the page. */ + storage = page_zip_dir_start(page_zip); + ut_ad(dict_index_get_n_unique_in_tree(index) == + rec_offs_n_fields(offsets) - 1); + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, + &len); + ut_ad(len == REC_NODE_PTR_SIZE); - /* Log that the data was zeroed out. */ - data = page_zip->data + page_zip->m_end; - ut_ad(!*data); - if (UNIV_UNLIKELY(heap_no - 1 >= 64)) { - *data++ = (byte) (0x80 | (heap_no - 1) >> 7); - ut_ad(!*data); - } - *data++ = (byte) ((heap_no - 1) << 1 | 1); - ut_ad(!*data); - ut_ad((ulint) (data - page_zip->data) - < page_zip_get_size(page_zip)); - page_zip->m_end = data - page_zip->data; - page_zip->m_nonempty = TRUE; - } else if (page_is_leaf(page) && dict_index_is_clust(index)) { - /* Do not clear the record, because there is not enough space - to log the operation. */ + ut_ad(!rec_offs_any_extern(offsets)); + memset(field, 0, REC_NODE_PTR_SIZE); + memset(storage - (heap_no - 1) * REC_NODE_PTR_SIZE, + 0, REC_NODE_PTR_SIZE); + } else if (dict_index_is_clust(index)) { + /* Clear trx_id and roll_ptr. On the compressed page, + there is an array of these fields immediately before the + dense page directory, at the very end of the page. 
*/ + const ulint trx_id_pos + = dict_col_get_clust_pos( + dict_table_get_sys_col( + index->table, DATA_TRX_ID), index); + storage = page_zip_dir_start(page_zip); + field = rec_get_nth_field(rec, offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + + memset(field, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + memset(storage - (heap_no - 1) + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN), + 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); if (rec_offs_any_extern(offsets)) { ulint i; @@ -4016,15 +4024,18 @@ page_zip_clear_rec( /* Clear all BLOB pointers in order to make page_zip_validate() pass. */ if (rec_offs_nth_extern(offsets, i)) { - ulint len; - byte* field = rec_get_nth_field( + field = rec_get_nth_field( rec, offsets, i, &len); + ut_ad(len + == BTR_EXTERN_FIELD_REF_SIZE); memset(field + len - BTR_EXTERN_FIELD_REF_SIZE, 0, BTR_EXTERN_FIELD_REF_SIZE); } } } + } else { + ut_ad(!rec_offs_any_extern(offsets)); } #ifdef UNIV_ZIP_DEBUG @@ -4455,6 +4466,8 @@ page_zip_reorganize( /* Copy the old page to temporary space */ buf_frame_copy(temp_page, page); + btr_blob_dbg_remove(page, index, "zip_reorg"); + /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) 
is preserved intact */ @@ -4513,7 +4526,7 @@ page_zip_copy_recs( mtr_t* mtr) /*!< in: mini-transaction */ { ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); - ut_ad(mtr_memo_contains_page(mtr, (page_t*) src, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, src, MTR_MEMO_PAGE_X_FIX)); ut_ad(!dict_index_is_ibuf(index)); #ifdef UNIV_ZIP_DEBUG /* The B-tree operations that call this function may set @@ -4583,6 +4596,7 @@ page_zip_copy_recs( #ifdef UNIV_ZIP_DEBUG ut_a(page_zip_validate(page_zip, page)); #endif /* UNIV_ZIP_DEBUG */ + btr_blob_dbg_add(page, index, "page_zip_copy_recs"); page_zip_compress_write_log(page_zip, page, index, mtr); } diff --git a/storage/xtradb/que/que0que.c b/storage/xtradb/que/que0que.c index 9c1d61c1731..5fccbb180fe 100644 --- a/storage/xtradb/que/que0que.c +++ b/storage/xtradb/que/que0que.c @@ -1418,6 +1418,12 @@ que_eval_sql( ut_a(trx->error_state == DB_SUCCESS); + if (trx->fake_changes) { + /* fake_changes should not access to system tables */ + fprintf(stderr, "InnoDB: ERROR: innodb_fake_changes tried to access to system tables.\n"); + return(DB_ERROR); + } + if (reserve_dict_mutex) { mutex_enter(&dict_sys->mutex); } diff --git a/storage/xtradb/rem/rem0rec.c b/storage/xtradb/rem/rem0rec.c index 37ba8ca2ffe..9f90d2940dd 100644 --- a/storage/xtradb/rem/rem0rec.c +++ b/storage/xtradb/rem/rem0rec.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -408,7 +408,7 @@ rec_init_offsets( do { ulint len; if (UNIV_UNLIKELY(i == n_node_ptr_field)) { - len = offs += 4; + len = offs += REC_NODE_PTR_SIZE; goto resolved; } @@ -640,7 +640,7 @@ rec_get_offsets_reverse( do { ulint len; if (UNIV_UNLIKELY(i == n_node_ptr_field)) { - len = offs += 4; + len = offs += REC_NODE_PTR_SIZE; goto resolved; } @@ -1131,9 +1131,9 @@ rec_convert_dtuple_to_rec_comp( if (UNIV_UNLIKELY(i == n_node_ptr_field)) { ut_ad(dtype_get_prtype(type) & DATA_NOT_NULL); - ut_ad(len == 4); + ut_ad(len == REC_NODE_PTR_SIZE); memcpy(end, dfield_get_data(field), len); - end += 4; + end += REC_NODE_PTR_SIZE; break; } diff --git a/storage/xtradb/row/row0ins.c b/storage/xtradb/row/row0ins.c index efcea62f212..0e62185c02e 100644 --- a/storage/xtradb/row/row0ins.c +++ b/storage/xtradb/row/row0ins.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -345,9 +345,9 @@ row_ins_clust_index_entry_by_modify( return(DB_LOCK_TABLE_FULL); } - err = btr_cur_pessimistic_update(0, cursor, - heap, big_rec, update, - 0, thr, mtr); + err = btr_cur_pessimistic_update( + BTR_KEEP_POS_FLAG, cursor, heap, big_rec, update, + 0, thr, mtr); } return(err); @@ -1512,6 +1512,11 @@ exit_func: if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } + + if (trx->fake_changes) { + err = DB_SUCCESS; + } + return(err); } @@ -1992,6 +1997,7 @@ row_ins_index_entry_low( ulint modify = 0; /* remove warning */ rec_t* insert_rec; rec_t* rec; + ulint* offsets; ulint err; ulint n_unique; big_rec_t* big_rec = NULL; @@ -2013,7 +2019,7 @@ row_ins_index_entry_low( } btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, - mode | BTR_INSERT | ignore_sec_unique, + thr_get_trx(thr)->fake_changes ? BTR_SEARCH_LEAF : (mode | BTR_INSERT | ignore_sec_unique), &cursor, 0, __FILE__, __LINE__, &mtr); if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) { @@ -2073,7 +2079,7 @@ row_ins_index_entry_low( btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, - mode | BTR_INSERT, + thr_get_trx(thr)->fake_changes ? BTR_SEARCH_LEAF : (mode | BTR_INSERT), &cursor, 0, __FILE__, __LINE__, &mtr); } @@ -2097,6 +2103,42 @@ row_ins_index_entry_low( err = row_ins_clust_index_entry_by_modify( mode, &cursor, &heap, &big_rec, entry, thr, &mtr); + + if (big_rec) { + ut_a(err == DB_SUCCESS); + /* Write out the externally stored + columns while still x-latching + index->lock and block->lock. We have + to mtr_commit(mtr) first, so that the + redo log will be written in the + correct order. Otherwise, we would run + into trouble on crash recovery if mtr + freed B-tree pages on which some of + the big_rec fields will be written. 
*/ + btr_cur_mtr_commit_and_start(&cursor, &mtr); + + rec = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets( + rec, index, NULL, + ULINT_UNDEFINED, &heap); + + err = btr_store_big_rec_extern_fields( + index, btr_cur_get_block(&cursor), + rec, offsets, &mtr, FALSE, big_rec); + /* If writing big_rec fails (for + example, because of DB_OUT_OF_FILE_SPACE), + the record will be corrupted. Even if + we did not update any externally + stored columns, our update could cause + the record to grow so that a + non-updated column was selected for + external storage. This non-update + would not have been written to the + undo log, and thus the record cannot + be rolled back. */ + ut_a(err == DB_SUCCESS); + goto stored_big_rec; + } } else { ut_ad(!n_ext); err = row_ins_sec_index_entry_by_modify( @@ -2125,8 +2167,22 @@ function_exit: mtr_commit(&mtr); if (UNIV_LIKELY_NULL(big_rec)) { - rec_t* rec; - ulint* offsets; + + if (thr_get_trx(thr)->fake_changes) { + /* skip store extern */ + if (modify) { + dtuple_big_rec_free(big_rec); + } else { + dtuple_convert_back_big_rec(index, entry, big_rec); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(err); + } + mtr_start(&mtr); btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, @@ -2140,6 +2196,7 @@ function_exit: index, btr_cur_get_block(&cursor), rec, offsets, &mtr, FALSE, big_rec); +stored_big_rec: if (modify) { dtuple_big_rec_free(big_rec); } else { diff --git a/storage/xtradb/row/row0mysql.c b/storage/xtradb/row/row0mysql.c index 1b97cbb0009..cb003ad62e0 100644 --- a/storage/xtradb/row/row0mysql.c +++ b/storage/xtradb/row/row0mysql.c @@ -1190,6 +1190,7 @@ run_again: prebuilt->table->stat_n_rows--; } + if (!(trx->fake_changes)) row_update_statistics_if_needed(prebuilt->table); trx->op_info = ""; @@ -1450,6 +1451,7 @@ run_again: that changes indexed columns, UPDATEs that change only non-indexed columns would not affect statistics. 
*/ if (node->is_delete || !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + if (!(trx->fake_changes)) row_update_statistics_if_needed(prebuilt->table); } @@ -1668,6 +1670,7 @@ run_again: srv_n_rows_updated++; } + if (!(trx->fake_changes)) row_update_statistics_if_needed(table); return(err); @@ -2556,10 +2559,29 @@ row_discard_tablespace_for_mysql( err = DB_ERROR; } else { + dict_index_t* index; + /* Set the flag which tells that now it is legal to IMPORT a tablespace for this table */ table->tablespace_discarded = TRUE; table->ibd_file_missing = TRUE; + + /* check adaptive hash entries */ + index = dict_table_get_first_index(table); + while (index) { + ulint ref_count = btr_search_info_get_ref_count(index->search_info); + if (ref_count) { + fprintf(stderr, "InnoDB: Warning:" + " hash index ref_count (%lu) is not zero" + " after fil_discard_tablespace().\n" + "index: \"%s\"" + " table: \"%s\"\n", + ref_count, + index->name, + table->name); + } + index = dict_table_get_next_index(index); + } } } @@ -2597,6 +2619,11 @@ row_import_tablespace_for_mysql( current_lsn = log_get_lsn(); + /* Enlarge the fatal lock wait timeout during import. 
*/ + mutex_enter(&kernel_mutex); + srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */ + mutex_exit(&kernel_mutex); + /* It is possible, though very improbable, that the lsn's in the tablespace to be imported have risen above the current system lsn, if a lengthy purge, ibuf merge, or rollback was performed on a backup @@ -2708,6 +2735,11 @@ funct_exit: trx->op_info = ""; + /* Restore the fatal semaphore wait timeout */ + mutex_enter(&kernel_mutex); + srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */ + mutex_exit(&kernel_mutex); + return((int) err); } @@ -2901,6 +2933,19 @@ row_truncate_table_for_mysql( table->space = space; index = dict_table_get_first_index(table); do { + ulint ref_count = btr_search_info_get_ref_count(index->search_info); + /* check adaptive hash entries */ + if (ref_count) { + fprintf(stderr, "InnoDB: Warning:" + " hash index ref_count (%lu) is not zero" + " after fil_discard_tablespace().\n" + "index: \"%s\"" + " table: \"%s\"\n", + ref_count, + index->name, + table->name); + } + index->space = space; index = dict_table_get_next_index(index); } while (index); diff --git a/storage/xtradb/row/row0row.c b/storage/xtradb/row/row0row.c index 0783d482f76..cea70e98dee 100644 --- a/storage/xtradb/row/row0row.c +++ b/storage/xtradb/row/row0row.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -47,35 +47,6 @@ Created 4/20/1996 Heikki Tuuri #include "read0read.h" #include "ut0mem.h" -/*********************************************************************//** -Gets the offset of trx id field, in bytes relative to the origin of -a clustered index record. 
-@return offset of DATA_TRX_ID */ -UNIV_INTERN -ulint -row_get_trx_id_offset( -/*==================*/ - const rec_t* rec __attribute__((unused)), - /*!< in: record */ - dict_index_t* index, /*!< in: clustered index */ - const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ -{ - ulint pos; - ulint offset; - ulint len; - - ut_ad(dict_index_is_clust(index)); - ut_ad(rec_offs_validate(rec, index, offsets)); - - pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); - - offset = rec_get_nth_field_offs(offsets, pos, &len); - - ut_ad(len == DATA_TRX_ID_LEN); - - return(offset); -} - /*****************************************************************//** When an insert or purge to a table is performed, this function builds the entry to be inserted into or purged from an index on the table. @@ -130,12 +101,27 @@ row_build_index_entry( dfield_copy(dfield, dfield2); - if (dfield_is_null(dfield) || ind_field->prefix_len == 0) { + if (dfield_is_null(dfield)) { + continue; + } + + if (ind_field->prefix_len == 0 + && (!dfield_is_ext(dfield) + || dict_index_is_clust(index))) { + /* The dfield_copy() above suffices for + columns that are stored in-page, or for + clustered index record columns that are not + part of a column prefix in the PRIMARY KEY. */ continue; } - /* If a column prefix index, take only the prefix. - Prefix-indexed columns may be externally stored. */ + /* If the column is stored externally (off-page) in + the clustered index, it must be an ordering field in + the secondary index. In the Antelope format, only + prefix-indexed columns may be stored off-page in the + clustered index record. In the Barracuda format, also + fully indexed long CHAR or VARCHAR columns may be + stored off-page. 
*/ ut_ad(col->ord_part); if (UNIV_LIKELY_NULL(ext)) { @@ -148,17 +134,41 @@ row_build_index_entry( } dfield_set_data(dfield, buf, len); } + + if (ind_field->prefix_len == 0) { + /* In the Barracuda format + (ROW_FORMAT=DYNAMIC or + ROW_FORMAT=COMPRESSED), we can have a + secondary index on an entire column + that is stored off-page in the + clustered index. As this is not a + prefix index (prefix_len == 0), + include the entire off-page column in + the secondary index record. */ + continue; + } } else if (dfield_is_ext(dfield)) { + /* This table is either in Antelope format + (ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT) + or a purge record where the ordered part of + the field is not external. + In Antelope, the maximum column prefix + index length is 767 bytes, and the clustered + index record contains a 768-byte prefix of + each off-page column. */ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); len -= BTR_EXTERN_FIELD_REF_SIZE; - ut_a(ind_field->prefix_len <= len - || dict_index_is_clust(index)); + dfield_set_len(dfield, len); } - len = dtype_get_at_most_n_mbchars( - col->prtype, col->mbminlen, col->mbmaxlen, - ind_field->prefix_len, len, dfield_get_data(dfield)); - dfield_set_len(dfield, len); + /* If a column prefix index, take only the prefix. */ + if (ind_field->prefix_len) { + len = dtype_get_at_most_n_mbchars( + col->prtype, col->mbminlen, col->mbmaxlen, + ind_field->prefix_len, len, + dfield_get_data(dfield)); + dfield_set_len(dfield, len); + } } ut_ad(dtuple_check_typed(entry)); @@ -223,6 +233,7 @@ row_build( ut_ad(index && rec && heap); ut_ad(dict_index_is_clust(index)); + ut_ad(!mutex_own(&kernel_mutex)); if (!offsets) { offsets = rec_get_offsets(rec, index, offsets_, @@ -231,6 +242,22 @@ row_build( ut_ad(rec_offs_validate(rec, index, offsets)); } +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* This condition can occur during crash recovery before + trx_rollback_active() has completed execution. 
+ + This condition is possible if the server crashed + during an insert or update before + btr_store_big_rec_extern_fields() did mtr_commit() all + BLOB pointers to the clustered index record. + + If the record contains a null BLOB pointer, look up the + transaction that holds the implicit lock on this record, and + assert that it was recovered (and will soon be rolled back). */ + ut_a(!rec_offs_any_null_extern(rec, offsets) + || trx_assert_recovered(row_get_rec_trx_id(rec, index, offsets))); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + if (type != ROW_COPY_POINTERS) { /* Take a copy of rec to heap */ buf = mem_heap_alloc(heap, rec_offs_size(offsets)); @@ -431,6 +458,10 @@ row_rec_to_index_entry( rec = rec_copy(buf, rec, offsets); /* Avoid a debug assertion in rec_offs_validate(). */ rec_offs_make_valid(rec, index, offsets); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + } else { + ut_a(!rec_offs_any_null_extern(rec, offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ } entry = row_rec_to_index_entry_low(rec, index, offsets, n_ext, heap); diff --git a/storage/xtradb/row/row0sel.c b/storage/xtradb/row/row0sel.c index 1ced125b7bd..e8e9465d838 100644 --- a/storage/xtradb/row/row0sel.c +++ b/storage/xtradb/row/row0sel.c @@ -2544,6 +2544,8 @@ row_sel_field_store_in_mysql_format( ut_ad(len != UNIV_SQL_NULL); UNIV_MEM_ASSERT_RW(data, len); + UNIV_MEM_ASSERT_W(dest, templ->mysql_col_len); + UNIV_MEM_INVALID(dest, templ->mysql_col_len); switch (templ->type) { case DATA_INT: @@ -2580,14 +2582,16 @@ row_sel_field_store_in_mysql_format( dest = row_mysql_store_true_var_len( dest, len, templ->mysql_length_bytes); + /* Copy the actual data. Leave the rest of the + buffer uninitialized. */ + memcpy(dest, data, len); + break; } /* Copy the actual data */ ut_memcpy(dest, data, len); - /* Pad with trailing spaces. We pad with spaces also the - unused end of a >= 5.0.3 true VARCHAR column, just in case - MySQL expects its contents to be deterministic. 
*/ + /* Pad with trailing spaces. */ pad_ptr = dest + len; @@ -3120,6 +3124,39 @@ sel_restore_position_for_mysql( } /********************************************************************//** +Copies a cached field for MySQL from the fetch cache. */ +static +void +row_sel_copy_cached_field_for_mysql( +/*================================*/ + byte* buf, /*!< in/out: row buffer */ + const byte* cache, /*!< in: cached row */ + const mysql_row_templ_t*templ) /*!< in: column template */ +{ + ulint len; + + buf += templ->mysql_col_offset; + cache += templ->mysql_col_offset; + + UNIV_MEM_ASSERT_W(buf, templ->mysql_col_len); + + if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR + && templ->type != DATA_INT) { + /* Check for != DATA_INT to make sure we do + not treat MySQL ENUM or SET as a true VARCHAR! + Find the actual length of the true VARCHAR field. */ + row_mysql_read_true_varchar( + &len, cache, templ->mysql_length_bytes); + len += templ->mysql_length_bytes; + UNIV_MEM_INVALID(buf, templ->mysql_col_len); + } else { + len = templ->mysql_col_len; + } + + ut_memcpy(buf, cache, len); +} + +/********************************************************************//** Pops a cached row for MySQL from the fetch cache. */ UNIV_INLINE void @@ -3131,26 +3168,22 @@ row_sel_pop_cached_row_for_mysql( { ulint i; const mysql_row_templ_t*templ; - byte* cached_rec; + const byte* cached_rec; ut_ad(prebuilt->n_fetch_cached > 0); ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len); + UNIV_MEM_ASSERT_W(buf, prebuilt->mysql_row_len); + + cached_rec = prebuilt->fetch_cache[prebuilt->fetch_cache_first]; + if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) { /* Copy cache record field by field, don't touch fields that are not covered by current key */ - cached_rec = prebuilt->fetch_cache[ - prebuilt->fetch_cache_first]; for (i = 0; i < prebuilt->n_template; i++) { templ = prebuilt->mysql_template + i; -#if 0 /* Some of the cached_rec may legitimately be uninitialized. 
*/ - UNIV_MEM_ASSERT_RW(cached_rec - + templ->mysql_col_offset, - templ->mysql_col_len); -#endif - ut_memcpy(buf + templ->mysql_col_offset, - cached_rec + templ->mysql_col_offset, - templ->mysql_col_len); + row_sel_copy_cached_field_for_mysql( + buf, cached_rec, templ); /* Copy NULL bit of the current field from cached_rec to buf */ if (templ->mysql_null_bit_mask) { @@ -3160,17 +3193,24 @@ row_sel_pop_cached_row_for_mysql( & (byte)templ->mysql_null_bit_mask; } } + } else if (prebuilt->mysql_prefix_len > 63) { + /* The record is long. Copy it field by field, in case + there are some long VARCHAR column of which only a + small length is being used. */ + UNIV_MEM_INVALID(buf, prebuilt->mysql_prefix_len); + + /* First copy the NULL bits. */ + ut_memcpy(buf, cached_rec, prebuilt->null_bitmap_len); + /* Then copy the requested fields. */ + + for (i = 0; i < prebuilt->n_template; i++) { + row_sel_copy_cached_field_for_mysql( + buf, cached_rec, prebuilt->mysql_template + i); + } + } else { + ut_memcpy(buf, cached_rec, prebuilt->mysql_prefix_len); } - else { -#if 0 /* Some of the cached_rec may legitimately be uninitialized. 
*/ - UNIV_MEM_ASSERT_RW(prebuilt->fetch_cache - [prebuilt->fetch_cache_first], - prebuilt->mysql_prefix_len); -#endif - ut_memcpy(buf, - prebuilt->fetch_cache[prebuilt->fetch_cache_first], - prebuilt->mysql_prefix_len); - } + prebuilt->n_fetch_cached--; prebuilt->fetch_cache_first++; @@ -3964,7 +4004,13 @@ rec_loop: if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) { wrong_offs: - if (srv_force_recovery == 0 || moves_up == FALSE) { + if (srv_pass_corrupt_table && !trx_sys_sys_space(index->table->space)) { + index->table->is_corrupt = TRUE; + fil_space_set_corrupt(index->table->space); + } + + if ((srv_force_recovery == 0 || moves_up == FALSE) + && srv_pass_corrupt_table <= 1) { ut_print_timestamp(stderr); buf_page_print(page_align(rec), 0); fprintf(stderr, @@ -4015,7 +4061,8 @@ wrong_offs: offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); - if (UNIV_UNLIKELY(srv_force_recovery > 0)) { + if (UNIV_UNLIKELY(srv_force_recovery > 0) + || (srv_pass_corrupt_table == 2 && index->table->is_corrupt)) { if (!rec_validate(rec, offsets) || !btr_index_rec_validate(rec, index, FALSE)) { fprintf(stderr, diff --git a/storage/xtradb/row/row0upd.c b/storage/xtradb/row/row0upd.c index a6fb266c4ed..5765d3b76d2 100644 --- a/storage/xtradb/row/row0upd.c +++ b/storage/xtradb/row/row0upd.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -504,14 +504,49 @@ row_upd_rec_in_place( n_fields = upd_get_n_fields(update); for (i = 0; i < n_fields; i++) { +#ifdef UNIV_BLOB_DEBUG + btr_blob_dbg_t b; + const byte* field_ref = NULL; +#endif /* UNIV_BLOB_DEBUG */ + upd_field = upd_get_nth_field(update, i); new_val = &(upd_field->new_val); ut_ad(!dfield_is_ext(new_val) == !rec_offs_nth_extern(offsets, upd_field->field_no)); +#ifdef UNIV_BLOB_DEBUG + if (dfield_is_ext(new_val)) { + ulint len; + field_ref = rec_get_nth_field(rec, offsets, i, &len); + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + b.ref_field_no = i; + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + ut_a(b.ref_field_no >= index->n_uniq); + btr_blob_dbg_rbt_delete(index, &b, "upd_in_place"); + } +#endif /* UNIV_BLOB_DEBUG */ rec_set_nth_field(rec, offsets, upd_field->field_no, dfield_get_data(new_val), dfield_get_len(new_val)); + +#ifdef UNIV_BLOB_DEBUG + if (dfield_is_ext(new_val)) { + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + b.always_owner = b.owner = !(field_ref[BTR_EXTERN_LEN] + & BTR_EXTERN_OWNER_FLAG); + b.del = rec_get_deleted_flag( + rec, rec_offs_comp(offsets)); + + btr_blob_dbg_rbt_insert(index, &b, "upd_in_place"); + } +#endif /* UNIV_BLOB_DEBUG */ } if (UNIV_LIKELY_NULL(page_zip)) { @@ -1556,8 +1591,9 @@ row_upd_sec_index_entry( mtr_start(&mtr); - found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur, - &mtr); + found = row_search_index_entry(index, entry, + trx->fake_changes ? 
BTR_SEARCH_LEAF : BTR_MODIFY_LEAF, + &pcur, &mtr); btr_cur = btr_pcur_get_btr_cur(&pcur); rec = btr_cur_get_rec(btr_cur); @@ -1787,9 +1823,11 @@ row_upd_clust_rec_by_insert( the previous invocation of this function. Mark the off-page columns in the entry inherited. */ + if (!(trx->fake_changes)) { change_ownership = row_upd_clust_rec_by_insert_inherit( NULL, NULL, entry, node->update); ut_a(change_ownership); + } /* fall through */ case UPD_NODE_INSERT_CLUSTERED: /* A lock wait occurred in row_ins_index_entry() in @@ -1819,7 +1857,7 @@ err_exit: delete-marked old record, mark them disowned by the old record and owned by the new entry. */ - if (rec_offs_any_extern(offsets)) { + if (rec_offs_any_extern(offsets) && !(trx->fake_changes)) { change_ownership = row_upd_clust_rec_by_insert_inherit( rec, offsets, entry, node->update); @@ -1947,33 +1985,50 @@ row_upd_clust_rec( the same transaction do not modify the record in the meantime. Therefore we can assert that the restoration of the cursor succeeds. */ - ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); + ut_a(btr_pcur_restore_position(thr_get_trx(thr)->fake_changes ? BTR_SEARCH_LEAF : BTR_MODIFY_TREE, + pcur, mtr)); ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), dict_table_is_comp(index->table))); - err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur, - &heap, &big_rec, node->update, - node->cmpl_info, thr, mtr); - mtr_commit(mtr); - - if (err == DB_SUCCESS && big_rec) { + err = btr_cur_pessimistic_update( + BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, btr_cur, + &heap, &big_rec, node->update, node->cmpl_info, thr, mtr); + /* skip store extern for fake_changes */ + if (big_rec && !(thr_get_trx(thr)->fake_changes)) { ulint offsets_[REC_OFFS_NORMAL_SIZE]; rec_t* rec; rec_offs_init(offsets_); - mtr_start(mtr); + ut_a(err == DB_SUCCESS); + /* Write out the externally stored columns while still + x-latching index->lock and block->lock. 
We have to + mtr_commit(mtr) first, so that the redo log will be + written in the correct order. Otherwise, we would run + into trouble on crash recovery if mtr freed B-tree + pages on which some of the big_rec fields will be + written. */ + btr_cur_mtr_commit_and_start(btr_cur, mtr); - ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); rec = btr_cur_get_rec(btr_cur); err = btr_store_big_rec_extern_fields( index, btr_cur_get_block(btr_cur), rec, rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap), mtr, TRUE, big_rec); - mtr_commit(mtr); + /* If writing big_rec fails (for example, because of + DB_OUT_OF_FILE_SPACE), the record will be corrupted. + Even if we did not update any externally stored + columns, our update could cause the record to grow so + that a non-updated column was selected for external + storage. This non-update would not have been written + to the undo log, and thus the record cannot be rolled + back. */ + ut_a(err == DB_SUCCESS); } + mtr_commit(mtr); + if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } @@ -2082,7 +2137,8 @@ row_upd_clust_step( ut_a(pcur->rel_pos == BTR_PCUR_ON); - success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr); + success = btr_pcur_restore_position(thr_get_trx(thr)->fake_changes ? BTR_SEARCH_LEAF : BTR_MODIFY_LEAF, + pcur, mtr); if (!success) { err = DB_RECORD_NOT_FOUND; diff --git a/storage/xtradb/row/row0vers.c b/storage/xtradb/row/row0vers.c index d4fde0b939b..8a7bb842293 100644 --- a/storage/xtradb/row/row0vers.c +++ b/storage/xtradb/row/row0vers.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -550,6 +550,11 @@ row_vers_build_for_consistent_read( /* The view already sees this version: we can copy it to in_heap and return */ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern( + version, *offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets)); *old_vers = rec_copy(buf, version, *offsets); @@ -583,6 +588,10 @@ row_vers_build_for_consistent_read( *offsets = rec_get_offsets(prev_version, index, *offsets, ULINT_UNDEFINED, offset_heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(prev_version, *offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + trx_id = row_get_rec_trx_id(prev_version, index, *offsets); if (read_view_sees_trx_id(view, trx_id)) { @@ -682,6 +691,10 @@ row_vers_build_for_semi_consistent_read( /* We found a version that belongs to a committed transaction: return it. 
*/ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(version, *offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + if (rec == version) { *old_vers = rec; err = DB_SUCCESS; @@ -739,6 +752,9 @@ row_vers_build_for_semi_consistent_read( version = prev_version; *offsets = rec_get_offsets(version, index, *offsets, ULINT_UNDEFINED, offset_heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(version, *offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ }/* for (;;) */ if (heap) { diff --git a/storage/xtradb/srv/srv0srv.c b/storage/xtradb/srv/srv0srv.c index f39d1b8a758..2ff729efbb1 100644 --- a/storage/xtradb/srv/srv0srv.c +++ b/storage/xtradb/srv/srv0srv.c @@ -86,6 +86,11 @@ Created 10/8/1995 Heikki Tuuri #include "trx0i_s.h" #include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */ +/* prototypes of new functions added to ha_innodb.cc for kill_idle_transaction */ +ibool innobase_thd_is_idle(const void* thd); +ib_int64_t innobase_thd_get_start_time(const void* thd); +void innobase_thd_kill(void* thd); + /* prototypes for new functions added to ha_innodb.cc */ ibool innobase_get_slow_log(); @@ -100,6 +105,9 @@ UNIV_INTERN ulint srv_activity_count = 0; /* The following is the maximum allowed duration of a lock wait. */ UNIV_INTERN ulint srv_fatal_semaphore_wait_threshold = 600; +/**/ +UNIV_INTERN lint srv_kill_idle_transaction = 0; + /* How much data manipulation language (DML) statements need to be delayed, in microseconds, in order to reduce the lagging of the purge thread. 
*/ UNIV_INTERN ulint srv_dml_needed_delay = 0; @@ -211,17 +219,15 @@ UNIV_INTERN ulint srv_buf_pool_curr_size = 0; UNIV_INTERN ulint srv_mem_pool_size = ULINT_MAX; UNIV_INTERN ulint srv_lock_table_size = ULINT_MAX; -/* key value for shm */ -UNIV_INTERN uint srv_buffer_pool_shm_key = 0; -UNIV_INTERN ibool srv_buffer_pool_shm_is_reused = FALSE; -UNIV_INTERN ibool srv_buffer_pool_shm_checksum = TRUE; - /* This parameter is deprecated. Use srv_n_io_[read|write]_threads instead. */ UNIV_INTERN ulint srv_n_file_io_threads = ULINT_MAX; UNIV_INTERN ulint srv_n_read_io_threads = ULINT_MAX; UNIV_INTERN ulint srv_n_write_io_threads = ULINT_MAX; +/* Switch to enable random read ahead. */ +UNIV_INTERN my_bool srv_random_read_ahead = FALSE; + /* The universal page size of the database */ UNIV_INTERN ulint srv_page_size_shift = 0; UNIV_INTERN ulint srv_page_size = 0; @@ -320,6 +326,9 @@ UNIV_INTERN ulint srv_buf_pool_reads = 0; /** Time in seconds between automatic buffer pool dumps */ UNIV_INTERN uint srv_auto_lru_dump = 0; +/** Whether startup should be blocked until buffer pool is fully restored */ +UNIV_INTERN ibool srv_blocking_lru_restore; + /* structure to pass status variables to MySQL */ UNIV_INTERN export_struc export_vars; @@ -2153,6 +2162,8 @@ srv_export_innodb_status(void) export_vars.innodb_buffer_pool_wait_free = srv_buf_pool_wait_free; export_vars.innodb_buffer_pool_pages_flushed = srv_buf_pool_flushed; export_vars.innodb_buffer_pool_reads = srv_buf_pool_reads; + export_vars.innodb_buffer_pool_read_ahead_rnd + = buf_pool->stat.n_ra_pages_read_rnd; export_vars.innodb_buffer_pool_read_ahead = buf_pool->stat.n_ra_pages_read; export_vars.innodb_buffer_pool_read_ahead_evicted @@ -2485,6 +2496,12 @@ srv_error_monitor_thread( ulint fatal_cnt = 0; ib_uint64_t old_lsn; ib_uint64_t new_lsn; + /* longest waiting thread for a semaphore */ + os_thread_id_t waiter = os_thread_get_curr_id(); + os_thread_id_t old_waiter = waiter; + /* the semaphore that is being waited for */ + 
const void* sema = NULL; + const void* old_sema = NULL; old_lsn = srv_start_lsn; @@ -2533,7 +2550,8 @@ loop: sync_arr_wake_threads_if_sema_free(); - if (sync_array_print_long_waits()) { + if (sync_array_print_long_waits(&waiter, &sema) + && sema == old_sema && os_thread_eq(waiter, old_waiter)) { fatal_cnt++; if (fatal_cnt > 10) { @@ -2548,6 +2566,38 @@ loop: } } else { fatal_cnt = 0; + old_waiter = waiter; + old_sema = sema; + } + + if (srv_kill_idle_transaction && trx_sys) { + trx_t* trx; + time_t now; +rescan_idle: + now = time(NULL); + mutex_enter(&kernel_mutex); + trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list); + while (trx) { + if (trx->conc_state == TRX_ACTIVE + && trx->mysql_thd + && innobase_thd_is_idle(trx->mysql_thd)) { + ib_int64_t start_time; /* as stmt ID */ + + start_time = innobase_thd_get_start_time(trx->mysql_thd); + if (trx->last_stmt_start != start_time) { + trx->idle_start = now; + trx->last_stmt_start = start_time; + } else if (difftime(now, trx->idle_start) + > srv_kill_idle_transaction) { + /* kill the session */ + mutex_exit(&kernel_mutex); + innobase_thd_kill(trx->mysql_thd); + goto rescan_idle; + } + } + trx = UT_LIST_GET_NEXT(mysql_trx_list, trx); + } + mutex_exit(&kernel_mutex); } /* Flush stderr so that a database user gets the output @@ -2593,7 +2643,9 @@ srv_LRU_dump_restore_thread( os_thread_pf(os_thread_get_curr_id())); #endif - if (srv_auto_lru_dump) + /* If srv_blocking_lru_restore is TRUE, restore will be done + synchronously on startup. 
*/ + if (srv_auto_lru_dump && !srv_blocking_lru_restore) buf_LRU_file_restore(); last_dump_time = time(NULL); diff --git a/storage/xtradb/srv/srv0start.c b/storage/xtradb/srv/srv0start.c index cef045d72e1..ba4328c80e1 100644 --- a/storage/xtradb/srv/srv0start.c +++ b/storage/xtradb/srv/srv0start.c @@ -88,6 +88,7 @@ Created 2/16/1996 Heikki Tuuri # include "thr0loc.h" # include "os0sync.h" /* for INNODB_RW_LOCKS_USE_ATOMICS */ # include "zlib.h" /* for ZLIB_VERSION */ +# include "buf0lru.h" /* for buf_LRU_file_restore() */ /** Log sequence number immediately after startup */ UNIV_INTERN ib_uint64_t srv_start_lsn; @@ -126,9 +127,9 @@ static mutex_t ios_mutex; static ulint ios; /** io_handler_thread parameters for thread identification */ -static ulint n[SRV_MAX_N_IO_THREADS + 7 + 64]; +static ulint n[SRV_MAX_N_IO_THREADS + 7 + UNIV_MAX_PARALLELISM]; /** io_handler_thread identifiers */ -static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 7 + 64]; +static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 7 + UNIV_MAX_PARALLELISM]; /** We use this mutex to test the return value of pthread_mutex_trylock on successful locking. HP-UX does NOT return 0, though Linux et al do. */ @@ -1200,6 +1201,12 @@ innobase_start_or_create_for_mysql(void) ); #endif +#ifdef UNIV_BLOB_DEBUG + fprintf(stderr, + "InnoDB: !!!!!!!! UNIV_BLOB_DEBUG switched on !!!!!!!!!\n" + "InnoDB: Server restart may fail with UNIV_BLOB_DEBUG\n"); +#endif /* UNIV_BLOB_DEBUG */ + #ifdef UNIV_SYNC_DEBUG fprintf(stderr, "InnoDB: !!!!!!!! UNIV_SYNC_DEBUG switched on !!!!!!!!!\n"); @@ -1744,8 +1751,6 @@ innobase_start_or_create_for_mysql(void) Note that this is not as heavy weight as it seems. At this point there will be only ONE page in the buf_LRU and there must be no page in the buf_flush list. */ - /* buffer_pool_shm should not be reused when recovery was needed. 
*/ - if (!srv_buffer_pool_shm_is_reused) buf_pool_invalidate(); /* We always try to do a recovery, even if the database had @@ -1875,6 +1880,11 @@ innobase_start_or_create_for_mysql(void) os_thread_create(&srv_LRU_dump_restore_thread, NULL, thread_ids + 5 + SRV_MAX_N_IO_THREADS); + /* If srv_blocking_lru_restore is TRUE, load buffer pool contents + synchronously */ + if (srv_auto_lru_dump && srv_blocking_lru_restore) + buf_LRU_file_restore(); + srv_is_being_started = FALSE; if (trx_doublewrite == NULL) { diff --git a/storage/xtradb/sync/sync0arr.c b/storage/xtradb/sync/sync0arr.c index 57a288089c7..4e788b4a968 100644 --- a/storage/xtradb/sync/sync0arr.c +++ b/storage/xtradb/sync/sync0arr.c @@ -913,8 +913,10 @@ Prints warnings of long semaphore waits to stderr. @return TRUE if fatal semaphore wait threshold was exceeded */ UNIV_INTERN ibool -sync_array_print_long_waits(void) -/*=============================*/ +sync_array_print_long_waits( +/*========================*/ + os_thread_id_t* waiter, /*!< out: longest waiting thread */ + const void** sema) /*!< out: longest-waited-for semaphore */ { sync_cell_t* cell; ibool old_val; @@ -922,24 +924,40 @@ sync_array_print_long_waits(void) ulint i; ulint fatal_timeout = srv_fatal_semaphore_wait_threshold; ibool fatal = FALSE; + double longest_diff = 0; for (i = 0; i < sync_primary_wait_array->n_cells; i++) { + double diff; + void* wait_object; + cell = sync_array_get_nth_cell(sync_primary_wait_array, i); - if (cell->wait_object != NULL && cell->waiting - && difftime(time(NULL), cell->reservation_time) > 240) { + wait_object = cell->wait_object; + + if (wait_object == NULL || !cell->waiting) { + + continue; + } + + diff = difftime(time(NULL), cell->reservation_time); + + if (diff > 240) { fputs("InnoDB: Warning: a long semaphore wait:\n", stderr); sync_array_cell_print(stderr, cell); noticed = TRUE; } - if (cell->wait_object != NULL && cell->waiting - && difftime(time(NULL), cell->reservation_time) - > fatal_timeout) { + if 
(diff > fatal_timeout) { fatal = TRUE; } + + if (diff > longest_diff) { + longest_diff = diff; + *sema = wait_object; + *waiter = cell->thread; + } } if (noticed) { diff --git a/storage/xtradb/sync/sync0rw.c b/storage/xtradb/sync/sync0rw.c index 9431de15fda..fe000e7d008 100644 --- a/storage/xtradb/sync/sync0rw.c +++ b/storage/xtradb/sync/sync0rw.c @@ -261,6 +261,9 @@ rw_lock_create_func( contains garbage at initialization and cannot be used for recursive x-locking. */ lock->recursive = FALSE; + /* Silence Valgrind when UNIV_DEBUG_VALGRIND is not enabled. */ + memset((void*) &lock->writer_thread, 0, sizeof lock->writer_thread); + UNIV_MEM_INVALID(&lock->writer_thread, sizeof lock->writer_thread); #ifdef UNIV_SYNC_DEBUG UT_LIST_INIT(lock->debug_list); @@ -762,7 +765,9 @@ rw_lock_add_debug_info( rw_lock_debug_mutex_exit(); if ((pass == 0) && (lock_type != RW_LOCK_WAIT_EX)) { - sync_thread_add_level(lock, lock->level); + sync_thread_add_level(lock, lock->level, + lock_type == RW_LOCK_EX + && lock->lock_word < 0); } } diff --git a/storage/xtradb/sync/sync0sync.c b/storage/xtradb/sync/sync0sync.c index 3a80da9318b..277a53e4fb2 100644 --- a/storage/xtradb/sync/sync0sync.c +++ b/storage/xtradb/sync/sync0sync.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. 
Portions of this file contain modifications contributed and copyrighted by @@ -668,7 +668,7 @@ mutex_set_debug_info( ut_ad(mutex); ut_ad(file_name); - sync_thread_add_level(mutex, mutex->level); + sync_thread_add_level(mutex, mutex->level, FALSE); mutex->file_name = file_name; mutex->line = line; @@ -1094,8 +1094,9 @@ void sync_thread_add_level( /*==================*/ void* latch, /*!< in: pointer to a mutex or an rw-lock */ - ulint level) /*!< in: level in the latching order; if + ulint level, /*!< in: level in the latching order; if SYNC_LEVEL_VARYING, nothing is done */ + ibool relock) /*!< in: TRUE if re-entering an x-lock */ { sync_level_t* array; sync_level_t* slot; @@ -1143,6 +1144,10 @@ sync_thread_add_level( array = thread_slot->levels; + if (relock) { + goto levels_ok; + } + /* NOTE that there is a problem with _NODE and _LEAF levels: if the B-tree height changes, then a leaf can change to an internal node or the other way around. We do not know at present if this can cause @@ -1287,6 +1292,7 @@ sync_thread_add_level( ut_error; } +levels_ok: for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { slot = sync_thread_levels_get_nth(array, i); diff --git a/storage/xtradb/trx/trx0i_s.c b/storage/xtradb/trx/trx0i_s.c index e148234888b..5cc9df2d5c4 100644 --- a/storage/xtradb/trx/trx0i_s.c +++ b/storage/xtradb/trx/trx0i_s.c @@ -504,7 +504,7 @@ fill_trx_row( query[stmt_len] = '\0'; row->trx_query = ha_storage_put_memlim( - cache->storage, stmt, stmt_len + 1, + cache->storage, query, stmt_len + 1, MAX_ALLOWED_FOR_STORAGE(cache)); row->trx_query_cs = innobase_get_charset(trx->mysql_thd); diff --git a/storage/xtradb/trx/trx0rec.c b/storage/xtradb/trx/trx0rec.c index 71629f01d73..a7a393d31c8 100644 --- a/storage/xtradb/trx/trx0rec.c +++ b/storage/xtradb/trx/trx0rec.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. 
All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1590,6 +1590,10 @@ trx_undo_prev_version_build( return(DB_ERROR); } +# if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(rec, offsets)); +# endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + if (row_upd_changes_field_size_or_external(index, offsets, update)) { ulint n_ext; diff --git a/storage/xtradb/trx/trx0sys.c b/storage/xtradb/trx/trx0sys.c index 566f7c95793..6a15d4261eb 100644 --- a/storage/xtradb/trx/trx0sys.c +++ b/storage/xtradb/trx/trx0sys.c @@ -37,6 +37,7 @@ Created 3/26/1996 Heikki Tuuri #include "trx0rseg.h" #include "trx0undo.h" #include "srv0srv.h" +#include "srv0start.h" #include "trx0purge.h" #include "log0log.h" #include "log0recv.h" @@ -1872,10 +1873,12 @@ void trx_sys_close(void) /*===============*/ { + trx_t* trx; trx_rseg_t* rseg; read_view_t* view; ut_ad(trx_sys != NULL); + ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); /* Check that all read views are closed except read view owned by a purge. */ @@ -1907,6 +1910,13 @@ trx_sys_close(void) mem_free(trx_doublewrite); trx_doublewrite = NULL; + /* Only prepared transactions may be left in the system. Free them. */ + ut_a(UT_LIST_GET_LEN(trx_sys->trx_list) == trx_n_prepared); + + while ((trx = UT_LIST_GET_FIRST(trx_sys->trx_list)) != NULL) { + trx_free_prepared(trx); + } + /* There can't be any active transactions. 
*/ rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); diff --git a/storage/xtradb/trx/trx0trx.c b/storage/xtradb/trx/trx0trx.c index 98bd9e4ac58..bf042db4972 100644 --- a/storage/xtradb/trx/trx0trx.c +++ b/storage/xtradb/trx/trx0trx.c @@ -50,6 +50,9 @@ UNIV_INTERN sess_t* trx_dummy_sess = NULL; /** Number of transactions currently allocated for MySQL: protected by the kernel mutex */ UNIV_INTERN ulint trx_n_mysql_transactions = 0; +/* Number of transactions currently in the XA PREPARED state: protected by +the kernel mutex */ +UNIV_INTERN ulint trx_n_prepared = 0; /*************************************************************//** Set detailed error message for the transaction. */ @@ -111,6 +114,8 @@ trx_create( trx->flush_log_at_trx_commit_session = 3; /* means to use innodb_flush_log_at_trx_commit value */ + trx->fake_changes = FALSE; + trx->check_foreigns = TRUE; trx->check_unique_secondary = TRUE; @@ -134,6 +139,9 @@ trx_create( trx->mysql_relay_log_file_name = ""; trx->mysql_relay_log_pos = 0; + trx->idle_start = 0; + trx->last_stmt_start = 0; + mutex_create(&trx->undo_mutex, SYNC_TRX_UNDO); trx->rseg = NULL; @@ -354,6 +362,60 @@ trx_free( } /********************************************************************//** +At shutdown, frees a transaction object that is in the PREPARED state. */ +UNIV_INTERN +void +trx_free_prepared( +/*==============*/ + trx_t* trx) /*!< in, own: trx object */ +{ + ut_ad(mutex_own(&kernel_mutex)); + ut_a(trx->conc_state == TRX_PREPARED); + ut_a(trx->magic_n == TRX_MAGIC_N); + + /* Prepared transactions are sort of active; they allow + ROLLBACK and COMMIT operations. 
Because the system does not + contain any other transactions than prepared transactions at + the shutdown stage and because a transaction cannot become + PREPARED while holding locks, it is safe to release the locks + held by PREPARED transactions here at shutdown.*/ + lock_release_off_kernel(trx); + + trx_undo_free_prepared(trx); + + mutex_free(&trx->undo_mutex); + + if (trx->undo_no_arr) { + trx_undo_arr_free(trx->undo_no_arr); + } + + ut_a(UT_LIST_GET_LEN(trx->signals) == 0); + ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0); + + ut_a(trx->wait_lock == NULL); + ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0); + + ut_a(!trx->has_search_latch); + + ut_a(trx->dict_operation_lock_mode == 0); + + if (trx->lock_heap) { + mem_heap_free(trx->lock_heap); + } + + if (trx->global_read_view_heap) { + mem_heap_free(trx->global_read_view_heap); + } + + ut_a(ib_vector_is_empty(trx->autoinc_locks)); + ib_vector_free(trx->autoinc_locks); + + UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx); + + mem_free(trx); +} + +/********************************************************************//** Frees a transaction object for MySQL. */ UNIV_INTERN void @@ -495,6 +557,7 @@ trx_lists_init_at_db_start(void) if (srv_force_recovery == 0) { trx->conc_state = TRX_PREPARED; + trx_n_prepared++; } else { fprintf(stderr, "InnoDB: Since" @@ -573,6 +636,7 @@ trx_lists_init_at_db_start(void) trx->conc_state = TRX_PREPARED; + trx_n_prepared++; } else { fprintf(stderr, "InnoDB: Since" @@ -878,6 +942,11 @@ trx_commit_off_kernel( || trx->conc_state == TRX_PREPARED); ut_ad(mutex_own(&kernel_mutex)); + if (UNIV_UNLIKELY(trx->conc_state == TRX_PREPARED)) { + ut_a(trx_n_prepared > 0); + trx_n_prepared--; + } + /* The following assignment makes the transaction committed in memory and makes its changes to data visible to other transactions. 
NOTE that there is a small discrepancy from the strict formal @@ -1945,6 +2014,7 @@ trx_prepare_off_kernel( /*--------------------------------------*/ trx->conc_state = TRX_PREPARED; + trx_n_prepared++; /*--------------------------------------*/ if (lsn) { @@ -2127,10 +2197,11 @@ trx_get_trx_by_xid( while (trx) { /* Compare two X/Open XA transaction id's: their length should be the same and binary comparison - of gtrid_lenght+bqual_length bytes should be + of gtrid_length+bqual_length bytes should be the same */ - if (trx->conc_state == TRX_PREPARED + if (trx->is_recovered + && trx->conc_state == TRX_PREPARED && xid->gtrid_length == trx->xid.gtrid_length && xid->bqual_length == trx->xid.bqual_length && memcmp(xid->data, trx->xid.data, diff --git a/storage/xtradb/trx/trx0undo.c b/storage/xtradb/trx/trx0undo.c index 9ed83b5d5c1..ec1cd2d2c43 100644 --- a/storage/xtradb/trx/trx0undo.c +++ b/storage/xtradb/trx/trx0undo.c @@ -36,6 +36,7 @@ Created 3/26/1996 Heikki Tuuri #include "trx0rseg.h" #include "trx0trx.h" #include "srv0srv.h" +#include "srv0start.h" #include "trx0rec.h" #include "trx0purge.h" @@ -2014,4 +2015,28 @@ trx_undo_insert_cleanup( mutex_exit(&(rseg->mutex)); } + +/********************************************************************//** +At shutdown, frees the undo logs of a PREPARED transaction. 
*/ +UNIV_INTERN +void +trx_undo_free_prepared( +/*===================*/ + trx_t* trx) /*!< in/out: PREPARED transaction */ +{ + ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); + + if (trx->update_undo) { + ut_a(trx->update_undo->state == TRX_UNDO_PREPARED); + UT_LIST_REMOVE(undo_list, trx->rseg->update_undo_list, + trx->update_undo); + trx_undo_mem_free(trx->update_undo); + } + if (trx->insert_undo) { + ut_a(trx->insert_undo->state == TRX_UNDO_PREPARED); + UT_LIST_REMOVE(undo_list, trx->rseg->insert_undo_list, + trx->insert_undo); + trx_undo_mem_free(trx->insert_undo); + } +} #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/ut/ut0mem.c b/storage/xtradb/ut/ut0mem.c index bf55e4273b6..95fb2187b79 100644 --- a/storage/xtradb/ut/ut0mem.c +++ b/storage/xtradb/ut/ut0mem.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -489,53 +489,6 @@ ut_strlcpy_rev( return(src_size); } -/**********************************************************************//** -Make a quoted copy of a NUL-terminated string. Leading and trailing -quotes will not be included; only embedded quotes will be escaped. -See also ut_strlenq() and ut_memcpyq(). -@return pointer to end of dest */ -UNIV_INTERN -char* -ut_strcpyq( -/*=======*/ - char* dest, /*!< in: output buffer */ - char q, /*!< in: the quote character */ - const char* src) /*!< in: null-terminated string */ -{ - while (*src) { - if ((*dest++ = *src++) == q) { - *dest++ = q; - } - } - - return(dest); -} - -/**********************************************************************//** -Make a quoted copy of a fixed-length string. 
Leading and trailing -quotes will not be included; only embedded quotes will be escaped. -See also ut_strlenq() and ut_strcpyq(). -@return pointer to end of dest */ -UNIV_INTERN -char* -ut_memcpyq( -/*=======*/ - char* dest, /*!< in: output buffer */ - char q, /*!< in: the quote character */ - const char* src, /*!< in: string to be quoted */ - ulint len) /*!< in: length of src */ -{ - const char* srcend = src + len; - - while (src < srcend) { - if ((*dest++ = *src++) == q) { - *dest++ = q; - } - } - - return(dest); -} - #ifndef UNIV_HOTBACKUP /**********************************************************************//** Return the number of times s2 occurs in s1. Overlapping instances of s2 |