summaryrefslogtreecommitdiff
path: root/storage
diff options
context:
space:
mode:
authorMikael Ronstrom <mikael@dator8>2011-03-04 12:35:24 +0100
committerMikael Ronstrom <mikael@dator8>2011-03-04 12:35:24 +0100
commit0fc7078e53eb2b34503ddd2408844c2fa06d393f (patch)
treebccbd7ab21bb499f9f924e6f90e9303f3018d1ae /storage
parent23296fdc18e864b1b016b4c6a47723b26480f9c6 (diff)
parenta4711099716c53490a69c3c17a8b61a4a3282b19 (diff)
downloadmariadb-git-0fc7078e53eb2b34503ddd2408844c2fa06d393f.tar.gz
merge
Diffstat (limited to 'storage')
-rw-r--r--storage/innobase/CMakeLists.txt4
-rw-r--r--storage/innobase/btr/btr0btr.c572
-rw-r--r--storage/innobase/btr/btr0cur.c44
-rw-r--r--storage/innobase/buf/buf0rea.c2
-rw-r--r--storage/innobase/dict/dict0load.c61
-rw-r--r--storage/innobase/dict/dict0mem.c9
-rw-r--r--storage/innobase/handler/ha_innodb.cc18
-rw-r--r--storage/innobase/handler/handler0alter.cc12
-rw-r--r--storage/innobase/include/btr0btr.h85
-rw-r--r--storage/innobase/include/btr0types.h125
-rw-r--r--storage/innobase/include/buf0flu.h13
-rw-r--r--storage/innobase/include/dict0dict.h12
-rw-r--r--storage/innobase/include/dict0dict.ic30
-rw-r--r--storage/innobase/include/dict0load.h5
-rw-r--r--storage/innobase/include/dict0mem.h11
-rw-r--r--storage/innobase/include/dict0types.h14
-rw-r--r--storage/innobase/include/page0zip.h2
-rw-r--r--storage/innobase/include/srv0srv.h7
-rw-r--r--storage/innobase/include/sync0sync.h6
-rw-r--r--storage/innobase/include/trx0purge.h22
-rw-r--r--storage/innobase/include/trx0rseg.h12
-rw-r--r--storage/innobase/include/trx0sys.h10
-rw-r--r--storage/innobase/include/trx0sys.ic12
-rw-r--r--storage/innobase/include/univ.i4
-rw-r--r--storage/innobase/include/ut0bh.h152
-rw-r--r--storage/innobase/include/ut0bh.ic125
-rw-r--r--storage/innobase/page/page0cur.c10
-rw-r--r--storage/innobase/page/page0page.c11
-rw-r--r--storage/innobase/page/page0zip.c5
-rw-r--r--storage/innobase/row/row0mysql.c6
-rw-r--r--storage/innobase/row/row0upd.c35
-rw-r--r--storage/innobase/srv/srv0srv.c22
-rw-r--r--storage/innobase/srv/srv0start.c6
-rw-r--r--storage/innobase/sync/sync0rw.c3
-rw-r--r--storage/innobase/sync/sync0sync.c14
-rw-r--r--storage/innobase/trx/trx0purge.c399
-rw-r--r--storage/innobase/trx/trx0rseg.c52
-rw-r--r--storage/innobase/trx/trx0sys.c43
-rw-r--r--storage/innobase/trx/trx0trx.c256
-rw-r--r--storage/innobase/ut/ut0bh.c164
-rw-r--r--storage/innobase/ut/ut0ut.c9
-rw-r--r--storage/myisam/mi_create.c16
42 files changed, 2007 insertions, 413 deletions
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt
index 4bbda0d2477..d4b98f2af0d 100644
--- a/storage/innobase/CMakeLists.txt
+++ b/storage/innobase/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2006, 2011, Oracle and/or its affiliates. All rights reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -246,7 +246,7 @@ SET(INNOBASE_SOURCES btr/btr0btr.c btr/btr0cur.c btr/btr0pcur.c btr/btr0sea.c
trx/trx0sys.c trx/trx0trx.c trx/trx0undo.c
usr/usr0sess.c
ut/ut0byte.c ut/ut0dbg.c ut/ut0list.c ut/ut0mem.c ut/ut0rbt.c ut/ut0rnd.c
- ut/ut0ut.c ut/ut0vec.c ut/ut0wqueue.c)
+ ut/ut0ut.c ut/ut0vec.c ut/ut0wqueue.c ut/ut0bh.c)
IF(WITH_INNODB)
# Legacy option
diff --git a/storage/innobase/btr/btr0btr.c b/storage/innobase/btr/btr0btr.c
index 2e544c0552a..fb2509a62ff 100644
--- a/storage/innobase/btr/btr0btr.c
+++ b/storage/innobase/btr/btr0btr.c
@@ -42,6 +42,560 @@ Created 6/2/1994 Heikki Tuuri
#include "ibuf0ibuf.h"
#include "trx0trx.h"
+#ifdef UNIV_BLOB_DEBUG
+# include "srv0srv.h"
+# include "ut0rbt.h"
+
+/** TRUE when messages about index->blobs modification are enabled. */
+static ibool btr_blob_dbg_msg;
+
+/** Issue a message about an operation on index->blobs.
+@param op operation
+@param b the entry being subjected to the operation
+@param ctx the context of the operation */
+#define btr_blob_dbg_msg_issue(op, b, ctx) \
+ fprintf(stderr, op " %u:%u:%u->%u %s(%u,%u,%u)\n", \
+ (b)->ref_page_no, (b)->ref_heap_no, \
+ (b)->ref_field_no, (b)->blob_page_no, ctx, \
+ (b)->owner, (b)->always_owner, (b)->del)
+
+/** Insert to index->blobs a reference to an off-page column.
+@param index the index tree
+@param b the reference
+@param ctx context (for logging) */
+UNIV_INTERN
+void
+btr_blob_dbg_rbt_insert(
+/*====================*/
+ dict_index_t* index, /*!< in/out: index tree */
+ const btr_blob_dbg_t* b, /*!< in: the reference */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ if (btr_blob_dbg_msg) {
+ btr_blob_dbg_msg_issue("insert", b, ctx);
+ }
+ mutex_enter(&index->blobs_mutex);
+ rbt_insert(index->blobs, b, b);
+ mutex_exit(&index->blobs_mutex);
+}
+
+/** Remove from index->blobs a reference to an off-page column.
+@param index the index tree
+@param b the reference
+@param ctx context (for logging) */
+UNIV_INTERN
+void
+btr_blob_dbg_rbt_delete(
+/*====================*/
+ dict_index_t* index, /*!< in/out: index tree */
+ const btr_blob_dbg_t* b, /*!< in: the reference */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ if (btr_blob_dbg_msg) {
+ btr_blob_dbg_msg_issue("delete", b, ctx);
+ }
+ mutex_enter(&index->blobs_mutex);
+ ut_a(rbt_delete(index->blobs, b));
+ mutex_exit(&index->blobs_mutex);
+}
+
+/**************************************************************//**
+Comparator for items (btr_blob_dbg_t) in index->blobs.
+The key in index->blobs is (ref_page_no, ref_heap_no, ref_field_no).
+@return negative, 0 or positive if *a<*b, *a=*b, *a>*b */
+static
+int
+btr_blob_dbg_cmp(
+/*=============*/
+ const void* a, /*!< in: first btr_blob_dbg_t to compare */
+ const void* b) /*!< in: second btr_blob_dbg_t to compare */
+{
+ const btr_blob_dbg_t* aa = a;
+ const btr_blob_dbg_t* bb = b;
+
+ ut_ad(aa != NULL);
+ ut_ad(bb != NULL);
+
+ if (aa->ref_page_no != bb->ref_page_no) {
+ return(aa->ref_page_no < bb->ref_page_no ? -1 : 1);
+ }
+ if (aa->ref_heap_no != bb->ref_heap_no) {
+ return(aa->ref_heap_no < bb->ref_heap_no ? -1 : 1);
+ }
+ if (aa->ref_field_no != bb->ref_field_no) {
+ return(aa->ref_field_no < bb->ref_field_no ? -1 : 1);
+ }
+ return(0);
+}
+
+/**************************************************************//**
+Add a reference to an off-page column to the index->blobs map. */
+UNIV_INTERN
+void
+btr_blob_dbg_add_blob(
+/*==================*/
+ const rec_t* rec, /*!< in: clustered index record */
+ ulint field_no, /*!< in: off-page column number */
+ ulint page_no, /*!< in: start page of the column */
+ dict_index_t* index, /*!< in/out: index tree */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ btr_blob_dbg_t b;
+ const page_t* page = page_align(rec);
+
+ ut_a(index->blobs);
+
+ b.blob_page_no = page_no;
+ b.ref_page_no = page_get_page_no(page);
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ b.ref_field_no = field_no;
+ ut_a(b.ref_field_no >= index->n_uniq);
+ b.always_owner = b.owner = TRUE;
+ b.del = FALSE;
+ ut_a(!rec_get_deleted_flag(rec, page_is_comp(page)));
+ btr_blob_dbg_rbt_insert(index, &b, ctx);
+}
+
+/**************************************************************//**
+Add to index->blobs any references to off-page columns from a record.
+@return number of references added */
+UNIV_INTERN
+ulint
+btr_blob_dbg_add_rec(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: offsets */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ ulint count = 0;
+ ulint i;
+ btr_blob_dbg_t b;
+ ibool del;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (!rec_offs_any_extern(offsets)) {
+ return(0);
+ }
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ del = (rec_get_deleted_flag(rec, rec_offs_comp(offsets)) != 0);
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ const byte* field_ref = rec_get_nth_field(
+ rec, offsets, i, &len);
+
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (!memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE)) {
+ /* the column has not been stored yet */
+ continue;
+ }
+
+ b.ref_field_no = i;
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+ ut_a(b.ref_field_no >= index->n_uniq);
+ b.always_owner = b.owner
+ = !(field_ref[BTR_EXTERN_LEN]
+ & BTR_EXTERN_OWNER_FLAG);
+ b.del = del;
+
+ btr_blob_dbg_rbt_insert(index, &b, ctx);
+ count++;
+ }
+ }
+
+ return(count);
+}
+
+/**************************************************************//**
+Display the references to off-page columns.
+This function is to be called from a debugger,
+for example when a breakpoint on ut_dbg_assertion_failed is hit. */
+UNIV_INTERN
+void
+btr_blob_dbg_print(
+/*===============*/
+ const dict_index_t* index) /*!< in: index tree */
+{
+ const ib_rbt_node_t* node;
+
+ if (!index->blobs) {
+ return;
+ }
+
+ /* We intentionally do not acquire index->blobs_mutex here.
+ This function is to be called from a debugger, and the caller
+ should make sure that the index->blobs_mutex is held. */
+
+ for (node = rbt_first(index->blobs);
+ node != NULL; node = rbt_next(index->blobs, node)) {
+ const btr_blob_dbg_t* b
+ = rbt_value(btr_blob_dbg_t, node);
+ fprintf(stderr, "%u:%u:%u->%u%s%s%s\n",
+ b->ref_page_no, b->ref_heap_no, b->ref_field_no,
+ b->blob_page_no,
+ b->owner ? "" : "(disowned)",
+ b->always_owner ? "" : "(has disowned)",
+ b->del ? "(deleted)" : "");
+ }
+}
+
+/**************************************************************//**
+Remove from index->blobs any references to off-page columns from a record.
+@return number of references removed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_remove_rec(
+/*====================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: offsets */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ ulint i;
+ ulint count = 0;
+ btr_blob_dbg_t b;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (!rec_offs_any_extern(offsets)) {
+ return(0);
+ }
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ const byte* field_ref = rec_get_nth_field(
+ rec, offsets, i, &len);
+
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b.ref_field_no = i;
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+
+ switch (b.blob_page_no) {
+ case 0:
+ /* The column has not been stored yet.
+ The BLOB pointer must be all zero.
+ There cannot be a BLOB starting at
+ page 0, because page 0 is reserved for
+ the tablespace header. */
+ ut_a(!memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+ /* fall through */
+ case FIL_NULL:
+ /* the column has been freed already */
+ continue;
+ }
+
+ btr_blob_dbg_rbt_delete(index, &b, ctx);
+ count++;
+ }
+ }
+
+ return(count);
+}
+
+/**************************************************************//**
+Check that there are no references to off-page columns from or to
+the given page. Invoked when freeing or clearing a page.
+@return TRUE when no orphan references exist */
+UNIV_INTERN
+ibool
+btr_blob_dbg_is_empty(
+/*==================*/
+ dict_index_t* index, /*!< in: index */
+ ulint page_no) /*!< in: page number */
+{
+ const ib_rbt_node_t* node;
+ ibool success = TRUE;
+
+ if (!index->blobs) {
+ return(success);
+ }
+
+ mutex_enter(&index->blobs_mutex);
+
+ for (node = rbt_first(index->blobs);
+ node != NULL; node = rbt_next(index->blobs, node)) {
+ const btr_blob_dbg_t* b
+ = rbt_value(btr_blob_dbg_t, node);
+
+ if (b->ref_page_no != page_no && b->blob_page_no != page_no) {
+ continue;
+ }
+
+ fprintf(stderr,
+ "InnoDB: orphan BLOB ref%s%s%s %u:%u:%u->%u\n",
+ b->owner ? "" : "(disowned)",
+ b->always_owner ? "" : "(has disowned)",
+ b->del ? "(deleted)" : "",
+ b->ref_page_no, b->ref_heap_no, b->ref_field_no,
+ b->blob_page_no);
+
+ if (b->blob_page_no != page_no || b->owner || !b->del) {
+ success = FALSE;
+ }
+ }
+
+ mutex_exit(&index->blobs_mutex);
+ return(success);
+}
+
+/**************************************************************//**
+Count and process all references to off-page columns on a page.
+@return number of references processed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_op(
+/*============*/
+ const page_t* page, /*!< in: B-tree leaf page */
+ const rec_t* rec, /*!< in: record to start from
+ (NULL to process the whole page) */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx, /*!< in: context (for logging) */
+ const btr_blob_dbg_op_f op) /*!< in: operation on records */
+{
+ ulint count = 0;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX);
+ ut_a(!rec || page_align(rec) == page);
+
+ if (!index->blobs || !page_is_leaf(page)
+ || !dict_index_is_clust(index)) {
+ return(0);
+ }
+
+ if (rec == NULL) {
+ rec = page_get_infimum_rec(page);
+ }
+
+ do {
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ count += op(rec, index, offsets, ctx);
+ rec = page_rec_get_next_const(rec);
+ } while (!page_rec_is_supremum(rec));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(count);
+}
+
+/**************************************************************//**
+Count and add to index->blobs any references to off-page columns
+from records on a page.
+@return number of references added */
+UNIV_INTERN
+ulint
+btr_blob_dbg_add(
+/*=============*/
+ const page_t* page, /*!< in: rewritten page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ btr_blob_dbg_assert_empty(index, page_get_page_no(page));
+
+ return(btr_blob_dbg_op(page, NULL, index, ctx, btr_blob_dbg_add_rec));
+}
+
+/**************************************************************//**
+Count and remove from index->blobs any references to off-page columns
+from records on a page.
+Used when reorganizing a page, before copying the records.
+@return number of references removed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_remove(
+/*================*/
+ const page_t* page, /*!< in: b-tree page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ ulint count;
+
+ count = btr_blob_dbg_op(page, NULL, index, ctx,
+ btr_blob_dbg_remove_rec);
+
+ /* Check that no references exist. */
+ btr_blob_dbg_assert_empty(index, page_get_page_no(page));
+
+ return(count);
+}
+
+/**************************************************************//**
+Restore in index->blobs any references to off-page columns
+Used when page reorganize fails due to compressed page overflow. */
+UNIV_INTERN
+void
+btr_blob_dbg_restore(
+/*=================*/
+ const page_t* npage, /*!< in: page that failed to compress */
+ const page_t* page, /*!< in: copy of original page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ ulint removed;
+ ulint added;
+
+ ut_a(page_get_page_no(npage) == page_get_page_no(page));
+ ut_a(page_get_space_id(npage) == page_get_space_id(page));
+
+ removed = btr_blob_dbg_remove(npage, index, ctx);
+ added = btr_blob_dbg_add(page, index, ctx);
+ ut_a(added == removed);
+}
+
+/**************************************************************//**
+Modify the 'deleted' flag of a record. */
+UNIV_INTERN
+void
+btr_blob_dbg_set_deleted_flag(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: rec_get_offs(rec, index) */
+ ibool del) /*!< in: TRUE=deleted, FALSE=exists */
+{
+ const ib_rbt_node_t* node;
+ btr_blob_dbg_t b;
+ btr_blob_dbg_t* c;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_a(dict_index_is_clust(index));
+ ut_a(del == !!del);/* must be FALSE==0 or TRUE==1 */
+
+ if (!rec_offs_any_extern(offsets) || !index->blobs) {
+
+ return;
+ }
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ const byte* field_ref = rec_get_nth_field(
+ rec, offsets, i, &len);
+
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b.ref_field_no = i;
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+
+ switch (b.blob_page_no) {
+ case 0:
+ ut_a(memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+ /* page number 0 is for the
+ page allocation bitmap */
+ case FIL_NULL:
+ /* the column has been freed already */
+ ut_error;
+ }
+
+ mutex_enter(&index->blobs_mutex);
+ node = rbt_lookup(index->blobs, &b);
+ ut_a(node);
+
+ c = rbt_value(btr_blob_dbg_t, node);
+ /* The flag should be modified. */
+ c->del = del;
+ if (btr_blob_dbg_msg) {
+ b = *c;
+ mutex_exit(&index->blobs_mutex);
+ btr_blob_dbg_msg_issue("del_mk", &b, "");
+ } else {
+ mutex_exit(&index->blobs_mutex);
+ }
+ }
+ }
+}
+
+/**************************************************************//**
+Change the ownership of an off-page column. */
+UNIV_INTERN
+void
+btr_blob_dbg_owner(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: rec_get_offs(rec, index) */
+ ulint i, /*!< in: ith field in rec */
+ ibool own) /*!< in: TRUE=owned, FALSE=disowned */
+{
+ const ib_rbt_node_t* node;
+ btr_blob_dbg_t b;
+ const byte* field_ref;
+ ulint len;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_a(rec_offs_nth_extern(offsets, i));
+
+ field_ref = rec_get_nth_field(rec, offsets, i, &len);
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ b.ref_field_no = i;
+ b.owner = !(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG);
+ b.blob_page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
+
+ ut_a(b.owner == own);
+
+ mutex_enter(&index->blobs_mutex);
+ node = rbt_lookup(index->blobs, &b);
+ /* row_ins_clust_index_entry_by_modify() invokes
+ btr_cur_unmark_extern_fields() also for the newly inserted
+ references, which are all zero bytes until the columns are stored.
+ The node lookup must fail if and only if that is the case. */
+ ut_a(!memcmp(field_ref, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)
+ == !node);
+
+ if (node) {
+ btr_blob_dbg_t* c = rbt_value(btr_blob_dbg_t, node);
+ /* Some code sets ownership from TRUE to TRUE.
+ We do not allow changing ownership from FALSE to FALSE. */
+ ut_a(own || c->owner);
+
+ c->owner = own;
+ if (!own) {
+ c->always_owner = FALSE;
+ }
+ }
+
+ mutex_exit(&index->blobs_mutex);
+}
+#endif /* UNIV_BLOB_DEBUG */
+
/*
Latching strategy of the InnoDB B-tree
--------------------------------------
@@ -296,6 +850,7 @@ btr_page_create(
page_t* page = buf_block_get_frame(block);
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block));
if (UNIV_LIKELY_NULL(page_zip)) {
page_create_zip(block, index, level, mtr);
@@ -489,6 +1044,7 @@ btr_page_free_low(
modify clock */
buf_block_modify_clock_inc(block);
+ btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block));
if (dict_index_is_ibuf(index)) {
@@ -774,6 +1330,14 @@ btr_create(
block = buf_page_get(space, zip_size, page_no,
RW_X_LATCH, mtr);
} else {
+#ifdef UNIV_BLOB_DEBUG
+ if ((type & DICT_CLUSTERED) && !index->blobs) {
+ mutex_create(PFS_NOT_INSTRUMENTED,
+ &index->blobs_mutex, SYNC_ANY_LATCH);
+ index->blobs = rbt_create(sizeof(btr_blob_dbg_t),
+ btr_blob_dbg_cmp);
+ }
+#endif /* UNIV_BLOB_DEBUG */
block = fseg_create(space, 0,
PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
}
@@ -998,6 +1562,7 @@ btr_page_reorganize_low(
block->check_index_page_at_flush = TRUE;
#endif /* !UNIV_HOTBACKUP */
+ btr_blob_dbg_remove(page, index, "btr_page_reorganize");
/* Recreate the page: note that global data on page (possible
segment headers, next page-field, etc.) is preserved intact */
@@ -1026,6 +1591,8 @@ btr_page_reorganize_low(
(!page_zip_compress(page_zip, page, index, NULL))) {
/* Restore the old page and exit. */
+ btr_blob_dbg_restore(page, temp_page, index,
+ "btr_page_reorganize_compress_fail");
#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
/* Check that the bytes that we skip are identical. */
@@ -1159,6 +1726,7 @@ btr_page_empty(
#endif /* UNIV_ZIP_DEBUG */
btr_search_drop_page_hash_index(block);
+ btr_blob_dbg_remove(page, index, "btr_page_empty");
/* Recreate the page: note that global data on page (possible
segment headers, next page-field, etc.) is preserved intact */
@@ -2499,6 +3067,7 @@ btr_lift_page_up(
index);
}
+ btr_blob_dbg_remove(page, index, "btr_lift_page_up");
lock_update_copy_and_discard(father_block, block);
/* Go upward to root page, decrementing levels by one. */
@@ -2760,6 +3329,7 @@ err_exit:
lock_update_merge_right(merge_block, orig_succ, block);
}
+ btr_blob_dbg_remove(page, index, "btr_compress");
mem_heap_free(heap);
if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) {
@@ -2990,6 +3560,8 @@ btr_discard_page(
block);
}
+ btr_blob_dbg_remove(page, index, "btr_discard_page");
+
/* Free the file page */
btr_page_free(index, block, mtr);
diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.c
index 8ba9fd721f5..e182840b1a0 100644
--- a/storage/innobase/btr/btr0cur.c
+++ b/storage/innobase/btr/btr0cur.c
@@ -2690,6 +2690,7 @@ btr_cur_del_mark_set_clust_rec(
page_zip = buf_block_get_page_zip(block);
+ btr_blob_dbg_set_deleted_flag(rec, index, offsets, val);
btr_rec_set_deleted_flag(rec, page_zip, val);
trx = thr_get_trx(thr);
@@ -3873,6 +3874,8 @@ btr_cur_set_ownership_of_extern_field(
} else {
mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
}
+
+ btr_blob_dbg_owner(rec, index, offsets, i, val);
}
/*******************************************************************//**
@@ -4373,6 +4376,11 @@ btr_store_big_rec_extern_fields_func(
}
if (prev_page_no == FIL_NULL) {
+ btr_blob_dbg_add_blob(
+ rec, big_rec_vec->fields[i]
+ .field_no, page_no, index,
+ "store");
+
mach_write_to_4(field_ref
+ BTR_EXTERN_SPACE_ID,
space_id);
@@ -4448,6 +4456,11 @@ next_zip_page:
MLOG_4BYTES, &mtr);
if (prev_page_no == FIL_NULL) {
+ btr_blob_dbg_add_blob(
+ rec, big_rec_vec->fields[i]
+ .field_no, page_no, index,
+ "store");
+
mlog_write_ulint(field_ref
+ BTR_EXTERN_SPACE_ID,
space_id,
@@ -4616,6 +4629,37 @@ btr_free_externally_stored_field(
rec_zip_size = 0;
}
+#ifdef UNIV_BLOB_DEBUG
+ if (!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)
+ && !((field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
+ && (rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY))) {
+ /* This off-page column will be freed.
+ Check that no references remain. */
+
+ btr_blob_dbg_t b;
+
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+
+ if (rec) {
+ /* Remove the reference from the record to the
+ BLOB. If the BLOB were not freed, the
+ reference would be removed when the record is
+ removed. Freeing the BLOB will overwrite the
+ BTR_EXTERN_PAGE_NO in the field_ref of the
+ record with FIL_NULL, which would make the
+ btr_blob_dbg information inconsistent with the
+ record. */
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ b.ref_field_no = i;
+ btr_blob_dbg_rbt_delete(index, &b, "free");
+ }
+
+ btr_blob_dbg_assert_empty(index, b.blob_page_no);
+ }
+#endif /* UNIV_BLOB_DEBUG */
+
for (;;) {
#ifdef UNIV_SYNC_DEBUG
buf_block_t* rec_block;
diff --git a/storage/innobase/buf/buf0rea.c b/storage/innobase/buf/buf0rea.c
index 519724d0b9a..3e36b707585 100644
--- a/storage/innobase/buf/buf0rea.c
+++ b/storage/innobase/buf/buf0rea.c
@@ -531,7 +531,7 @@ buf_read_ibuf_merge_pages(
buf_pool_t* buf_pool;
ulint zip_size = fil_space_get_zip_size(space_ids[i]);
- buf_pool = buf_pool_get(space_ids[i], space_versions[i]);
+ buf_pool = buf_pool_get(space_ids[i], page_nos[i]);
while (buf_pool->n_pend_reads
> buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
diff --git a/storage/innobase/dict/dict0load.c b/storage/innobase/dict/dict0load.c
index 3561e7220ab..14490980bb6 100644
--- a/storage/innobase/dict/dict0load.c
+++ b/storage/innobase/dict/dict0load.c
@@ -1323,7 +1323,10 @@ ulint
dict_load_indexes(
/*==============*/
dict_table_t* table, /*!< in/out: table */
- mem_heap_t* heap) /*!< in: memory heap for temporary storage */
+ mem_heap_t* heap, /*!< in: memory heap for temporary storage */
+ dict_err_ignore_t ignore_err)
+ /*!< in: error to be ignored when
+ loading the index definition */
{
dict_table_t* sys_indexes;
dict_index_t* sys_index;
@@ -1406,10 +1409,22 @@ dict_load_indexes(
"InnoDB: but the index tree has been freed!\n",
index->name, table->name);
+ if (ignore_err & DICT_ERR_IGNORE_INDEX_ROOT) {
+ /* If caller can tolerate this error,
+ we will continue to load the index and
+ let caller deal with this error. However
+ mark the index and table corrupted */
+ index->corrupted = TRUE;
+ table->corrupted = TRUE;
+ fprintf(stderr,
+ "InnoDB: Index is corrupt but forcing"
+ " load into data dictionary\n");
+ } else {
corrupted:
- dict_mem_index_free(index);
- error = DB_CORRUPTION;
- goto func_exit;
+ dict_mem_index_free(index);
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
} else if (!dict_index_is_clust(index)
&& NULL == dict_table_get_first_index(table)) {
@@ -1618,7 +1633,10 @@ dict_load_table(
/*============*/
const char* name, /*!< in: table name in the
databasename/tablename format */
- ibool cached) /*!< in: TRUE=add to cache, FALSE=do not */
+ ibool cached, /*!< in: TRUE=add to cache, FALSE=do not */
+ dict_err_ignore_t ignore_err)
+ /*!< in: error to be ignored when loading
+ table and its indexes' definition */
{
dict_table_t* table;
dict_table_t* sys_tables;
@@ -1733,7 +1751,7 @@ err_exit:
mem_heap_empty(heap);
- err = dict_load_indexes(table, heap);
+ err = dict_load_indexes(table, heap, ignore_err);
/* Initialize table foreign_child value. Its value could be
changed when dict_load_foreigns() is called below */
@@ -1810,6 +1828,8 @@ dict_load_table_on_id(
ut_ad(mutex_own(&(dict_sys->mutex)));
+ table = NULL;
+
/* NOTE that the operation of this function is protected by
the dictionary mutex, and therefore no deadlocks can occur
with other dictionary operations. */
@@ -1836,15 +1856,17 @@ dict_load_table_on_id(
BTR_SEARCH_LEAF, &pcur, &mtr);
rec = btr_pcur_get_rec(&pcur);
- if (!btr_pcur_is_on_user_rec(&pcur)
- || rec_get_deleted_flag(rec, 0)) {
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
/* Not found */
+ goto func_exit;
+ }
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
- mem_heap_free(heap);
-
- return(NULL);
+ /* Find the first record that is not delete marked */
+ while (rec_get_deleted_flag(rec, 0)) {
+ if (!btr_pcur_move_to_next_user_rec(&pcur, &mtr)) {
+ goto func_exit;
+ }
+ rec = btr_pcur_get_rec(&pcur);
}
/*---------------------------------------------------*/
@@ -1857,20 +1879,15 @@ dict_load_table_on_id(
/* Check if the table id in record is the one searched for */
if (table_id != mach_read_from_8(field)) {
-
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
- mem_heap_free(heap);
-
- return(NULL);
+ goto func_exit;
}
/* Now we get the table name from the record */
field = rec_get_nth_field_old(rec, 1, &len);
/* Load the table definition to memory */
table = dict_load_table(mem_heap_strdupl(heap, (char*) field, len),
- TRUE);
-
+ TRUE, DICT_ERR_IGNORE_NONE);
+func_exit:
btr_pcur_close(&pcur);
mtr_commit(&mtr);
mem_heap_free(heap);
@@ -1894,7 +1911,7 @@ dict_load_sys_table(
heap = mem_heap_create(1000);
- dict_load_indexes(table, heap);
+ dict_load_indexes(table, heap, DICT_ERR_IGNORE_NONE);
mem_heap_free(heap);
}
diff --git a/storage/innobase/dict/dict0mem.c b/storage/innobase/dict/dict0mem.c
index 8d3d78f3900..a442a3811d8 100644
--- a/storage/innobase/dict/dict0mem.c
+++ b/storage/innobase/dict/dict0mem.c
@@ -38,6 +38,9 @@ Created 1/8/1996 Heikki Tuuri
#ifndef UNIV_HOTBACKUP
# include "lock0lock.h"
#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_BLOB_DEBUG
+# include "ut0rbt.h"
+#endif /* UNIV_BLOB_DEBUG */
#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when
creating a table or index object */
@@ -380,6 +383,12 @@ dict_mem_index_free(
{
ut_ad(index);
ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+#ifdef UNIV_BLOB_DEBUG
+ if (index->blobs) {
+ mutex_free(&index->blobs_mutex);
+ rbt_free(index->blobs);
+ }
+#endif /* UNIV_BLOB_DEBUG */
mem_heap_free(index->heap);
}
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 31c7ac38c24..14870b5ee63 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2000, 2010, MySQL AB & Innobase Oy. All Rights Reserved.
+Copyright (c) 2000, 2011, MySQL AB & Innobase Oy. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
Copyright (c) 2009, Percona Inc.
@@ -262,7 +262,7 @@ static PSI_mutex_info all_innodb_mutexes[] = {
# endif /* UNIV_MEM_DEBUG */
{&mem_pool_mutex_key, "mem_pool_mutex", 0},
{&mutex_list_mutex_key, "mutex_list_mutex", 0},
- {&purge_sys_mutex_key, "purge_sys_mutex", 0},
+ {&purge_sys_bh_mutex_key, "purge_sys_bh_mutex", 0},
{&recv_sys_mutex_key, "recv_sys_mutex", 0},
{&rseg_mutex_key, "rseg_mutex", 0},
# ifdef UNIV_SYNC_DEBUG
@@ -10981,16 +10981,23 @@ static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity,
static MYSQL_SYSVAR_ULONG(purge_batch_size, srv_purge_batch_size,
PLUGIN_VAR_OPCMDARG,
- "Number of UNDO logs to purge in one batch from the history list. "
- "Default is 20",
+ "Number of UNDO log pages to purge in one batch from the history list.",
NULL, NULL,
20, /* Default setting */
1, /* Minimum value */
5000, 0); /* Maximum value */
+static MYSQL_SYSVAR_ULONG(rollback_segments, srv_rollback_segments,
+ PLUGIN_VAR_OPCMDARG,
+ "Number of UNDO logs to use.",
+ NULL, NULL,
+ 128, /* Default setting */
+ 1, /* Minimum value */
+ TRX_SYS_N_RSEGS, 0); /* Maximum value */
+
static MYSQL_SYSVAR_ULONG(purge_threads, srv_n_purge_threads,
PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
- "Purge threads can be either 0 or 1. Default is 0.",
+ "Purge threads can be either 0 or 1.",
NULL, NULL,
0, /* Default setting */
0, /* Minimum value */
@@ -11341,6 +11348,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(io_capacity),
MYSQL_SYSVAR(purge_threads),
MYSQL_SYSVAR(purge_batch_size),
+ MYSQL_SYSVAR(rollback_segments),
NULL
};
diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc
index c0bd3121c9c..e0279c13de2 100644
--- a/storage/innobase/handler/handler0alter.cc
+++ b/storage/innobase/handler/handler0alter.cc
@@ -786,10 +786,6 @@ err_exit:
ut_ad(error == DB_SUCCESS);
- /* We will need to rebuild index translation table. Set
- valid index entry count in the translation table to zero */
- share->idx_trans_tbl.index_count = 0;
-
/* Commit the data dictionary transaction in order to release
the table locks on the system tables. This means that if
MySQL crashes while creating a new primary key inside
@@ -915,6 +911,14 @@ error:
}
convert_error:
+ if (error == DB_SUCCESS) {
+ /* Build index is successful. We will need to
+ rebuild index translation table. Reset the
+ index entry count in the translation table
+ to zero, so that translation table will be rebuilt */
+ share->idx_trans_tbl.index_count = 0;
+ }
+
error = convert_error_code_to_mysql(error,
innodb_table->flags,
user_thd);
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
index e213c22937f..5b3e166371d 100644
--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -92,6 +92,91 @@ insert/delete buffer when the record is not in the buffer pool. */
buffer when the record is not in the buffer pool. */
#define BTR_DELETE 8192
+#ifdef UNIV_BLOB_DEBUG
+# include "ut0rbt.h"
+/** An index->blobs entry for keeping track of off-page column references */
+struct btr_blob_dbg_struct
+{
+ unsigned blob_page_no:32; /*!< first BLOB page number */
+ unsigned ref_page_no:32; /*!< referring page number */
+ unsigned ref_heap_no:16; /*!< referring heap number */
+ unsigned ref_field_no:10; /*!< referring field number */
+ unsigned owner:1; /*!< TRUE if BLOB owner */
+ unsigned always_owner:1; /*!< TRUE if always
+ has been the BLOB owner;
+ reset to TRUE on B-tree
+ page splits and merges */
+ unsigned del:1; /*!< TRUE if currently
+ delete-marked */
+};
+
+/**************************************************************//**
+Add a reference to an off-page column to the index->blobs map. */
+UNIV_INTERN
+void
+btr_blob_dbg_add_blob(
+/*==================*/
+ const rec_t* rec, /*!< in: clustered index record */
+ ulint field_no, /*!< in: number of off-page column */
+ ulint page_no, /*!< in: start page of the column */
+ dict_index_t* index, /*!< in/out: index tree */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Display the references to off-page columns.
+This function is to be called from a debugger,
+for example when a breakpoint on ut_dbg_assertion_failed is hit. */
+UNIV_INTERN
+void
+btr_blob_dbg_print(
+/*===============*/
+ const dict_index_t* index) /*!< in: index tree */
+ __attribute__((nonnull));
+/**************************************************************//**
+Check that there are no references to off-page columns from or to
+the given page. Invoked when freeing or clearing a page.
+@return TRUE when no orphan references exist */
+UNIV_INTERN
+ibool
+btr_blob_dbg_is_empty(
+/*==================*/
+ dict_index_t* index, /*!< in: index */
+ ulint page_no) /*!< in: page number */
+ __attribute__((nonnull, warn_unused_result));
+
+/**************************************************************//**
+Modify the 'deleted' flag of a record. */
+UNIV_INTERN
+void
+btr_blob_dbg_set_deleted_flag(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: rec_get_offs(rec, index) */
+ ibool del) /*!< in: TRUE=deleted, FALSE=exists */
+ __attribute__((nonnull));
+/**************************************************************//**
+Change the ownership of an off-page column. */
+UNIV_INTERN
+void
+btr_blob_dbg_owner(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: rec_get_offs(rec, index) */
+ ulint i, /*!< in: ith field in rec */
+ ibool own) /*!< in: TRUE=owned, FALSE=disowned */
+ __attribute__((nonnull));
+/** Assert that there are no BLOB references to or from the given page. */
+# define btr_blob_dbg_assert_empty(index, page_no) \
+ ut_a(btr_blob_dbg_is_empty(index, page_no))
+#else /* UNIV_BLOB_DEBUG */
+# define btr_blob_dbg_add_blob(rec, field_no, page, index, ctx) ((void) 0)
+# define btr_blob_dbg_set_deleted_flag(rec, index, offsets, del)((void) 0)
+# define btr_blob_dbg_owner(rec, index, offsets, i, val) ((void) 0)
+# define btr_blob_dbg_assert_empty(index, page_no) ((void) 0)
+#endif /* UNIV_BLOB_DEBUG */
+
/**************************************************************//**
Gets the root node of a tree and x-latches it.
@return root page, x-latched */
diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h
index ef4a6b04b34..07c06fb18d7 100644
--- a/storage/innobase/include/btr0types.h
+++ b/storage/innobase/include/btr0types.h
@@ -38,6 +38,131 @@ typedef struct btr_cur_struct btr_cur_t;
/** B-tree search information for the adaptive hash index */
typedef struct btr_search_struct btr_search_t;
+#ifdef UNIV_BLOB_DEBUG
+# include "buf0types.h"
+/** An index->blobs entry for keeping track of off-page column references */
+typedef struct btr_blob_dbg_struct btr_blob_dbg_t;
+
+/** Insert to index->blobs a reference to an off-page column.
+@param index the index tree
+@param b the reference
+@param ctx context (for logging) */
+UNIV_INTERN
+void
+btr_blob_dbg_rbt_insert(
+/*====================*/
+ dict_index_t* index, /*!< in/out: index tree */
+ const btr_blob_dbg_t* b, /*!< in: the reference */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+
+/** Remove from index->blobs a reference to an off-page column.
+@param index the index tree
+@param b the reference
+@param ctx context (for logging) */
+UNIV_INTERN
+void
+btr_blob_dbg_rbt_delete(
+/*====================*/
+ dict_index_t* index, /*!< in/out: index tree */
+ const btr_blob_dbg_t* b, /*!< in: the reference */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+
+/**************************************************************//**
+Add to index->blobs any references to off-page columns from a record.
+@return number of references added */
+UNIV_INTERN
+ulint
+btr_blob_dbg_add_rec(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: offsets */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Remove from index->blobs any references to off-page columns from a record.
+@return number of references removed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_remove_rec(
+/*====================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: offsets */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Count and add to index->blobs any references to off-page columns
+from records on a page.
+@return number of references added */
+UNIV_INTERN
+ulint
+btr_blob_dbg_add(
+/*=============*/
+ const page_t* page, /*!< in: rewritten page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Count and remove from index->blobs any references to off-page columns
+from records on a page.
+Used when reorganizing a page, before copying the records.
+@return number of references removed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_remove(
+/*================*/
+ const page_t* page, /*!< in: b-tree page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Restore in index->blobs any references to off-page columns
+Used when page reorganize fails due to compressed page overflow. */
+UNIV_INTERN
+void
+btr_blob_dbg_restore(
+/*=================*/
+ const page_t* npage, /*!< in: page that failed to compress */
+ const page_t* page, /*!< in: copy of original page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+
+/** Operation that processes the BLOB references of an index record
+@param[in] rec record on index page
+@param[in/out] index the index tree of the record
+@param[in] offsets rec_get_offsets(rec,index)
+@param[in] ctx context (for logging)
+@return number of BLOB references processed */
+typedef ulint (*btr_blob_dbg_op_f)
+(const rec_t* rec,dict_index_t* index,const ulint* offsets,const char* ctx);
+
+/**************************************************************//**
+Count and process all references to off-page columns on a page.
+@return number of references processed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_op(
+/*============*/
+ const page_t* page, /*!< in: B-tree leaf page */
+ const rec_t* rec, /*!< in: record to start from
+ (NULL to process the whole page) */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx, /*!< in: context (for logging) */
+ const btr_blob_dbg_op_f op) /*!< in: operation on records */
+ __attribute__((nonnull(1,3,4,5)));
+#else /* UNIV_BLOB_DEBUG */
+# define btr_blob_dbg_add_rec(rec, index, offsets, ctx) ((void) 0)
+# define btr_blob_dbg_add(page, index, ctx) ((void) 0)
+# define btr_blob_dbg_remove_rec(rec, index, offsets, ctx) ((void) 0)
+# define btr_blob_dbg_remove(page, index, ctx) ((void) 0)
+# define btr_blob_dbg_restore(npage, page, index, ctx) ((void) 0)
+# define btr_blob_dbg_op(page, rec, index, ctx, op) ((void) 0)
+#endif /* UNIV_BLOB_DEBUG */
+
/** The size of a reference to data stored on a different page.
The reference is stored at the end of the prefix of the field
in the index record. */
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
index 28d9b90e755..ae27f5dab0e 100644
--- a/storage/innobase/include/buf0flu.h
+++ b/storage/innobase/include/buf0flu.h
@@ -138,7 +138,18 @@ UNIV_INTERN
void
buf_flush_wait_batch_end(
/*=====================*/
- buf_pool_t* buf_pool, /*!< buffer pool instance */
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ enum buf_flush type); /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+/******************************************************************//**
+Waits until a flush batch of the given type ends. This is called by
+a thread that only wants to wait for a flush to end but doesn't do
+any flushing itself. */
+UNIV_INTERN
+void
+buf_flush_wait_batch_end_wait_only(
+/*===============================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
enum buf_flush type); /*!< in: BUF_FLUSH_LRU
or BUF_FLUSH_LIST */
/********************************************************************//**
diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h
index 033c435bf16..d6f2bebae3a 100644
--- a/storage/innobase/include/dict0dict.h
+++ b/storage/innobase/include/dict0dict.h
@@ -441,6 +441,18 @@ function.
@return table, NULL if not found */
UNIV_INLINE
dict_table_t*
+dict_table_get_low_ignore_err(
+/*===========================*/
+ const char* table_name, /*!< in: table name */
+ dict_err_ignore_t
+ ignore_err); /*!< in: error to be ignored when
+ loading a table definition */
+/**********************************************************************//**
+Gets a table; loads it to the dictionary cache if necessary. A low-level
+function.
+@return table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
dict_table_get_low(
/*===============*/
const char* table_name); /*!< in: table name */
diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic
index 42f124dedfc..59606af7056 100644
--- a/storage/innobase/include/dict0dict.ic
+++ b/storage/innobase/include/dict0dict.ic
@@ -828,6 +828,34 @@ dict_table_check_if_in_cache_low(
}
/**********************************************************************//**
+load a table into dictionary cache, ignore any error specified during load;
+@return table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low_ignore_err(
+/*==========================*/
+ const char* table_name, /*!< in: table name */
+ dict_err_ignore_t
+ ignore_err) /*!< in: error to be ignored when
+ loading a table definition */
+{
+ dict_table_t* table;
+
+ ut_ad(table_name);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ table = dict_table_check_if_in_cache_low(table_name);
+
+ if (table == NULL) {
+ table = dict_load_table(table_name, TRUE, ignore_err);
+ }
+
+ ut_ad(!table || table->cached);
+
+ return(table);
+}
+
+/**********************************************************************//**
Gets a table; loads it to the dictionary cache if necessary. A low-level
function.
@return table, NULL if not found */
@@ -845,7 +873,7 @@ dict_table_get_low(
table = dict_table_check_if_in_cache_low(table_name);
if (table == NULL) {
- table = dict_load_table(table_name, TRUE);
+ table = dict_load_table(table_name, TRUE, DICT_ERR_IGNORE_NONE);
}
ut_ad(!table || table->cached);
diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h
index f009f221f32..51d07f43446 100644
--- a/storage/innobase/include/dict0load.h
+++ b/storage/innobase/include/dict0load.h
@@ -170,7 +170,10 @@ dict_load_table(
/*============*/
const char* name, /*!< in: table name in the
databasename/tablename format */
- ibool cached);/*!< in: TRUE=add to cache, FALSE=do not */
+ ibool cached, /*!< in: TRUE=add to cache, FALSE=do not */
+ dict_err_ignore_t ignore_err);
+ /*!< in: error to be ignored when loading
+ table and its indexes' definition */
/***********************************************************************//**
Loads a table object based on the table id.
@return table; NULL if table does not exist */
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
index 5757b79d3ed..75d3b2c3302 100644
--- a/storage/innobase/include/dict0mem.h
+++ b/storage/innobase/include/dict0mem.h
@@ -361,6 +361,8 @@ struct dict_index_struct{
/*!< TRUE if this index is marked to be
dropped in ha_innobase::prepare_drop_index(),
otherwise FALSE */
+ unsigned corrupted:1;
+ /*!< TRUE if the index object is corrupted */
dict_field_t* fields; /*!< array of field descriptions */
#ifndef UNIV_HOTBACKUP
UT_LIST_NODE_T(dict_index_t)
@@ -395,6 +397,13 @@ struct dict_index_struct{
index, or 0 if the index existed
when InnoDB was started up */
#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_BLOB_DEBUG
+ mutex_t blobs_mutex;
+ /*!< mutex protecting blobs */
+ void* blobs; /*!< map of (page_no,heap_no,field_no)
+ to first_blob_page_no; protected by
+ blobs_mutex; @see btr_blob_dbg_t */
+#endif /* UNIV_BLOB_DEBUG */
#ifdef UNIV_DEBUG
ulint magic_n;/*!< magic number */
/** Value of dict_index_struct::magic_n */
@@ -487,6 +496,8 @@ struct dict_table_struct{
to the dictionary cache */
unsigned n_def:10;/*!< number of columns defined so far */
unsigned n_cols:10;/*!< number of columns */
+ unsigned corrupted:1;
+ /*!< TRUE if table is corrupted */
dict_col_t* cols; /*!< array of column descriptions */
const char* col_names;
/*!< Column names packed in a character string
diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h
index 687209575c9..8cbd7cd5783 100644
--- a/storage/innobase/include/dict0types.h
+++ b/storage/innobase/include/dict0types.h
@@ -43,4 +43,18 @@ typedef struct tab_node_struct tab_node_t;
typedef ib_id_t table_id_t;
typedef ib_id_t index_id_t;
+/** Error to ignore when we load table dictionary into memory. However,
+the table and index will be marked as "corrupted", and caller will
+be responsible to deal with corrupted table or index.
+Note: please define the IGNORE_ERR_* as bits, so their value can
+be or-ed together */
+enum dict_err_ignore {
+ DICT_ERR_IGNORE_NONE = 0, /*!< no error to ignore */
+ DICT_ERR_IGNORE_INDEX_ROOT = 1, /*!< ignore error if index root
+ page is FIL_NUL or incorrect value */
+ DICT_ERR_IGNORE_ALL = 0xFFFF /*!< ignore all errors */
+};
+
+typedef enum dict_err_ignore dict_err_ignore_t;
+
#endif
diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h
index 574809e5227..00c1d0516e6 100644
--- a/storage/innobase/include/page0zip.h
+++ b/storage/innobase/include/page0zip.h
@@ -420,7 +420,7 @@ page_zip_copy_recs(
const page_t* src, /*!< in: page */
dict_index_t* index, /*!< in: index of the B-tree */
mtr_t* mtr) /*!< in: mini-transaction */
- __attribute__((nonnull(1,2,3,4)));
+ __attribute__((nonnull));
#endif /* !UNIV_HOTBACKUP */
/**********************************************************************//**
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index d600ad4a034..252d1424c63 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, 2009, Google Inc.
Copyright (c) 2009, Percona Inc.
@@ -294,9 +294,12 @@ extern ulint srv_log_waits;
/* the number of purge threads to use from the worker pool (currently 0 or 1) */
extern ulong srv_n_purge_threads;
-/* the number of records to purge in one batch */
+/* the number of pages to purge in one batch */
extern ulong srv_purge_batch_size;
+/* the number of rollback segments to use */
+extern ulong srv_rollback_segments;
+
/* variable that counts amount of data read in total (in bytes) */
extern ulint srv_data_read;
diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h
index fb3a24c5ee5..b4f9e21933f 100644
--- a/storage/innobase/include/sync0sync.h
+++ b/storage/innobase/include/sync0sync.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -93,7 +93,7 @@ extern mysql_pfs_key_t mem_hash_mutex_key;
# endif /* UNIV_MEM_DEBUG */
extern mysql_pfs_key_t mem_pool_mutex_key;
extern mysql_pfs_key_t mutex_list_mutex_key;
-extern mysql_pfs_key_t purge_sys_mutex_key;
+extern mysql_pfs_key_t purge_sys_bh_mutex_key;
extern mysql_pfs_key_t recv_sys_mutex_key;
extern mysql_pfs_key_t rseg_mutex_key;
# ifdef UNIV_SYNC_DEBUG
@@ -637,7 +637,6 @@ or row lock! */
#define SYNC_TREE_NODE_NEW 892
#define SYNC_TREE_NODE_FROM_HASH 891
#define SYNC_TREE_NODE 890
-#define SYNC_PURGE_SYS 810
#define SYNC_PURGE_LATCH 800
#define SYNC_TRX_UNDO 700
#define SYNC_RSEG 600
@@ -659,6 +658,7 @@ or row lock! */
#define SYNC_REC_LOCK 299
#define SYNC_TRX_LOCK_HEAP 298
#define SYNC_TRX_SYS_HEADER 290
+#define SYNC_PURGE_QUEUE 200
#define SYNC_LOG 170
#define SYNC_LOG_FLUSH_ORDER 147
#define SYNC_RECV 168
diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h
index d2730a68a78..0b83a76cab7 100644
--- a/storage/innobase/include/trx0purge.h
+++ b/storage/innobase/include/trx0purge.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -68,8 +68,9 @@ Creates the global purge system control structure and inits the history
mutex. */
UNIV_INTERN
void
-trx_purge_sys_create(void);
-/*======================*/
+trx_purge_sys_create(
+/*=================*/
+ ib_bh_t* ib_bh); /*!< in/own: UNDO log min binary heap*/
/********************************************************************//**
Frees the global purge system control structure. */
UNIV_INTERN
@@ -128,20 +129,20 @@ struct trx_purge_struct{
ulint state; /*!< Purge system state */
sess_t* sess; /*!< System session running the purge
query */
- trx_t* trx; /*!< System transaction running the purge
+ trx_t* trx; /*!< System transaction running the
+ purge
query: this trx is not in the trx list
of the trx system and it never ends */
que_t* query; /*!< The query graph which will do the
parallelized purge operation */
- rw_lock_t latch; /*!< The latch protecting the purge view.
- A purge operation must acquire an
- x-latch here for the instant at which
+ rw_lock_t latch; /*!< The latch protecting the purge
+ view. A purge operation must acquire
+ an x-latch here for the instant at which
it changes the purge view: an undo
log operation can prevent this by
obtaining an s-latch here. */
read_view_t* view; /*!< The purge will not remove undo logs
which are >= this view (purge view) */
- mutex_t mutex; /*!< Mutex protecting the fields below */
ulint n_pages_handled;/*!< Approximate number of undo log
pages processed in purge */
ulint handle_limit; /*!< Target of how many pages to get
@@ -179,6 +180,11 @@ struct trx_purge_struct{
mem_heap_t* heap; /*!< Temporary storage used during a
purge: can be emptied after purge
completes */
+ /*-----------------------------*/
+ ib_bh_t* ib_bh; /*!< Binary min-heap, ordered on
+ rseg_queue_t::trx_no. It is protected
+ by the bh_mutex */
+ mutex_t bh_mutex; /*!< Mutex protecting ib_bh */
};
#define TRX_PURGE_ON 1 /* purge operation is running */
diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h
index a293d9b896e..5acde05de3d 100644
--- a/storage/innobase/include/trx0rseg.h
+++ b/storage/innobase/include/trx0rseg.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -113,7 +113,9 @@ void
trx_rseg_list_and_array_init(
/*=========================*/
trx_sysf_t* sys_header, /*!< in: trx system header */
+ ib_bh_t* ib_bh, /*!< in: rseg queue */
mtr_t* mtr); /*!< in: mtr */
+
/***************************************************************************
Free's an instance of the rollback segment in memory. */
UNIV_INTERN
@@ -180,6 +182,14 @@ struct trx_rseg_struct{
memory objects */
};
+/** For prioritising the rollback segments for purge. */
+struct rseg_queue_struct {
+ trx_id_t trx_no; /*!< trx_rseg_t::last_trx_no */
+ trx_rseg_t* rseg; /*!< Rollback segment */
+};
+
+typedef struct rseg_queue_struct rseg_queue_t;
+
/* Undo log segment slot in a rollback segment header */
/*-------------------------------------------------------------*/
#define TRX_RSEG_SLOT_PAGE_NO 0 /* Page number of the header page of
diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
index 63e3f6be934..dc0ca2285b9 100644
--- a/storage/innobase/include/trx0sys.h
+++ b/storage/innobase/include/trx0sys.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -38,6 +38,7 @@ Created 3/26/1996 Heikki Tuuri
#include "mem0mem.h"
#include "sync0sync.h"
#include "ut0lst.h"
+#include "ut0bh.h"
#include "read0types.h"
#include "page0types.h"
@@ -221,13 +222,6 @@ UNIV_INLINE
trx_id_t
trx_sys_get_new_trx_id(void);
/*========================*/
-/*****************************************************************//**
-Allocates a new transaction number.
-@return new, allocated trx number */
-UNIV_INLINE
-trx_id_t
-trx_sys_get_new_trx_no(void);
-/*========================*/
#endif /* !UNIV_HOTBACKUP */
/*****************************************************************//**
Writes a trx id to an index page. In case that the id size changes in
diff --git a/storage/innobase/include/trx0sys.ic b/storage/innobase/include/trx0sys.ic
index 385c7f4f0cc..355f118a1ec 100644
--- a/storage/innobase/include/trx0sys.ic
+++ b/storage/innobase/include/trx0sys.ic
@@ -369,16 +369,4 @@ trx_sys_get_new_trx_id(void)
return(id);
}
-/*****************************************************************//**
-Allocates a new transaction number.
-@return new, allocated trx number */
-UNIV_INLINE
-trx_id_t
-trx_sys_get_new_trx_no(void)
-/*========================*/
-{
- ut_ad(mutex_own(&kernel_mutex));
-
- return(trx_sys_get_new_trx_id());
-}
#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i
index a99b8306eb8..f561226a2de 100644
--- a/storage/innobase/include/univ.i
+++ b/storage/innobase/include/univ.i
@@ -51,7 +51,7 @@ Created 1/20/1994 Heikki Tuuri
#define INNODB_VERSION_MAJOR 1
#define INNODB_VERSION_MINOR 1
-#define INNODB_VERSION_BUGFIX 5
+#define INNODB_VERSION_BUGFIX 6
/* The following is the InnoDB version as shown in
SELECT plugin_version FROM information_schema.plugins;
@@ -201,6 +201,8 @@ this will break redo log file compatibility, but it may be useful when
debugging redo log application problems. */
#define UNIV_MEM_DEBUG /* detect memory leaks etc */
#define UNIV_IBUF_DEBUG /* debug the insert buffer */
+#define UNIV_BLOB_DEBUG /* track BLOB ownership;
+assumes that no BLOBs survive server restart */
#define UNIV_IBUF_COUNT_DEBUG /* debug the insert buffer;
this limits the database to IBUF_COUNT_N_SPACES and IBUF_COUNT_N_PAGES,
and the insert buffer must be empty when the database is started */
diff --git a/storage/innobase/include/ut0bh.h b/storage/innobase/include/ut0bh.h
new file mode 100644
index 00000000000..1b211390283
--- /dev/null
+++ b/storage/innobase/include/ut0bh.h
@@ -0,0 +1,152 @@
+/***************************************************************************//**
+
+Copyright (c) 2011, Oracle Corpn. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0bh.h
+Binary min-heap interface.
+
+Created 2010-05-28 by Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_UT0BH_H
+#define INNOBASE_UT0BH_H
+
+#include "univ.i"
+
+/** Comparison function for objects in the binary heap. */
+typedef int (*ib_bh_cmp_t)(const void* p1, const void* p2);
+
+typedef struct ib_bh_struct ib_bh_t;
+
+/**********************************************************************//**
+Get the number of elements in the binary heap.
+@return number of elements */
+UNIV_INLINE
+ulint
+ib_bh_size(
+/*=======*/
+ const ib_bh_t* ib_bh); /*!< in: instance */
+
+/**********************************************************************//**
+Test if binary heap is empty.
+@return TRUE if empty. */
+UNIV_INLINE
+ibool
+ib_bh_is_empty(
+/*===========*/
+ const ib_bh_t* ib_bh); /*!< in: instance */
+
+/**********************************************************************//**
+Test if binary heap is full.
+@return TRUE if full. */
+UNIV_INLINE
+ibool
+ib_bh_is_full(
+/*===========*/
+ const ib_bh_t* ib_bh); /*!< in: instance */
+
+/**********************************************************************//**
+Get a pointer to the element.
+@return pointer to element */
+UNIV_INLINE
+void*
+ib_bh_get(
+/*=======*/
+ ib_bh_t* ib_bh, /*!< in: instance */
+ ulint i); /*!< in: index */
+
+/**********************************************************************//**
+Copy an element to the binary heap.
+@return pointer to copied element */
+UNIV_INLINE
+void*
+ib_bh_set(
+/*======*/
+ ib_bh_t* ib_bh, /*!< in/out: instance */
+ ulint i, /*!< in: index */
+ const void* elem); /*!< in: element to add */
+
+/**********************************************************************//**
+Return the first element from the binary heap.
+@return pointer to first element or NULL if empty. */
+UNIV_INLINE
+void*
+ib_bh_first(
+/*========*/
+ ib_bh_t* ib_bh); /*!< in: instance */
+
+/**********************************************************************//**
+Return the last element from the binary heap.
+@return pointer to last element or NULL if empty. */
+UNIV_INLINE
+void*
+ib_bh_last(
+/*========*/
+ ib_bh_t* ib_bh); /*!< in/out: instance */
+
+/**********************************************************************//**
+Create a binary heap.
+@return a new binary heap */
+UNIV_INTERN
+ib_bh_t*
+ib_bh_create(
+/*=========*/
+ ib_bh_cmp_t compare, /*!< in: comparator */
+ ulint sizeof_elem, /*!< in: size of one element */
+ ulint max_elems); /*!< in: max elements allowed */
+
+/**********************************************************************//**
+Free a binary heap.
+@return a new binary heap */
+UNIV_INTERN
+void
+ib_bh_free(
+/*=======*/
+ ib_bh_t* ib_bh); /*!< in,own: instance */
+
+/**********************************************************************//**
+Add an element to the binary heap. Note: The element is copied.
+@return pointer to added element or NULL if full. */
+UNIV_INTERN
+void*
+ib_bh_push(
+/*=======*/
+ ib_bh_t* ib_bh, /*!< in/out: instance */
+ const void* elem); /*!< in: element to add */
+
+/**********************************************************************//**
+Remove the first element from the binary heap. */
+UNIV_INTERN
+void
+ib_bh_pop(
+/*======*/
+ ib_bh_t* ib_bh); /*!< in/out: instance */
+
+/** Binary heap data structure */
+struct ib_bh_struct {
+ ulint max_elems; /*!< max elements allowed */
+ ulint n_elems; /*!< current size */
+ ulint sizeof_elem; /*!< sizeof element */
+ ib_bh_cmp_t compare; /*!< comparator */
+};
+
+#ifndef UNIV_NONINL
+#include "ut0bh.ic"
+#endif
+
+#endif /* INNOBASE_UT0BH_H */
diff --git a/storage/innobase/include/ut0bh.ic b/storage/innobase/include/ut0bh.ic
new file mode 100644
index 00000000000..afbe58e7e3b
--- /dev/null
+++ b/storage/innobase/include/ut0bh.ic
@@ -0,0 +1,125 @@
+/***************************************************************************//**
+Copyright (c) 2011, Oracle Corpn. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0bh.ic
+Binary min-heap implementation.
+
+Created 2011-01-15 by Sunny Bains
+*******************************************************/
+
+#include "ut0bh.h"
+#include "ut0mem.h" /* For ut_memcpy() */
+
+/**********************************************************************//**
+Get the number of elements in the binary heap.
+@return number of elements */
+UNIV_INLINE
+ulint
+ib_bh_size(
+/*=======*/
+ const ib_bh_t* ib_bh) /*!< in: instance */
+{
+ return(ib_bh->n_elems);
+}
+
+/**********************************************************************//**
+Test if binary heap is empty.
+@return TRUE if empty. */
+UNIV_INLINE
+ibool
+ib_bh_is_empty(
+/*===========*/
+ const ib_bh_t* ib_bh) /*!< in: instance */
+{
+ return(ib_bh_size(ib_bh) == 0);
+}
+
+/**********************************************************************//**
+Test if binary heap is full.
+@return TRUE if full. */
+UNIV_INLINE
+ibool
+ib_bh_is_full(
+/*===========*/
+ const ib_bh_t* ib_bh) /*!< in: instance */
+{
+ return(ib_bh_size(ib_bh) >= ib_bh->max_elems);
+}
+
+/**********************************************************************//**
+Get a pointer to the element.
+@return pointer to element */
+UNIV_INLINE
+void*
+ib_bh_get(
+/*=======*/
+ ib_bh_t* ib_bh, /*!< in: instance */
+ ulint i) /*!< in: index */
+{
+ byte* ptr = (byte*) (ib_bh + 1);
+
+ ut_a(i < ib_bh_size(ib_bh));
+
+ return(ptr + (ib_bh->sizeof_elem * i));
+}
+
+/**********************************************************************//**
+Copy an element to the binary heap.
+@return pointer to copied element */
+UNIV_INLINE
+void*
+ib_bh_set(
+/*======*/
+ ib_bh_t* ib_bh, /*!< in/out: instance */
+ ulint i, /*!< in: index */
+ const void* elem) /*!< in: element to add */
+{
+ void* ptr = ib_bh_get(ib_bh, i);
+
+ ut_memcpy(ptr, elem, ib_bh->sizeof_elem);
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Return the first element from the binary heap.
+@return pointer to first element or NULL if empty. */
+UNIV_INLINE
+void*
+ib_bh_first(
+/*========*/
+ ib_bh_t* ib_bh) /*!< in: instance */
+{
+ return(ib_bh_is_empty(ib_bh) ? NULL : ib_bh_get(ib_bh, 0));
+}
+
+/**********************************************************************//**
+Return the last element from the binary heap.
+@return pointer to last element or NULL if empty. */
+UNIV_INLINE
+void*
+ib_bh_last(
+/*========*/
+ ib_bh_t* ib_bh) /*!< in/out: instance */
+{
+ return(ib_bh_is_empty(ib_bh)
+ ? NULL
+ : ib_bh_get(ib_bh, ib_bh_size(ib_bh) - 1));
+}
+
+
diff --git a/storage/innobase/page/page0cur.c b/storage/innobase/page/page0cur.c
index f10f16a7dd9..936762b986a 100644
--- a/storage/innobase/page/page0cur.c
+++ b/storage/innobase/page/page0cur.c
@@ -1149,6 +1149,8 @@ use_heap:
current_rec, index, mtr);
}
+ btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert");
+
return(insert_rec);
}
@@ -1195,10 +1197,12 @@ page_cur_insert_rec_zip_reorg(
}
/* Out of space: restore the page */
+ btr_blob_dbg_remove(page, index, "insert_zip_fail");
if (!page_zip_decompress(page_zip, page, FALSE)) {
ut_error; /* Memory corrupted? */
}
ut_ad(page_validate(page, index));
+ btr_blob_dbg_add(page, index, "insert_zip_fail");
return(NULL);
}
@@ -1490,6 +1494,8 @@ use_heap:
page_zip_write_rec(page_zip, insert_rec, index, offsets, 1);
+ btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert_zip_ok");
+
/* 9. Write log record of the insert */
if (UNIV_LIKELY(mtr != NULL)) {
page_cur_insert_rec_write_log(insert_rec, rec_size,
@@ -1697,6 +1703,9 @@ page_copy_rec_list_end_to_created_page(
heap_top += rec_size;
+ rec_offs_make_valid(insert_rec, index, offsets);
+ btr_blob_dbg_add_rec(insert_rec, index, offsets, "copy_end");
+
page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec,
index, mtr);
prev_rec = insert_rec;
@@ -1944,6 +1953,7 @@ page_cur_delete_rec(
page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1);
/* 6. Free the memory occupied by the record */
+ btr_blob_dbg_remove_rec(current_rec, index, offsets, "delete");
page_mem_free(page, page_zip, current_rec, index, offsets);
/* 7. Now we have decremented the number of owned records of the slot.
diff --git a/storage/innobase/page/page0page.c b/storage/innobase/page/page0page.c
index 2e785412ac9..99182e85849 100644
--- a/storage/innobase/page/page0page.c
+++ b/storage/innobase/page/page0page.c
@@ -685,12 +685,16 @@ page_copy_rec_list_end(
if (UNIV_UNLIKELY
(!page_zip_reorganize(new_block, index, mtr))) {
+ btr_blob_dbg_remove(new_page, index,
+ "copy_end_reorg_fail");
if (UNIV_UNLIKELY
(!page_zip_decompress(new_page_zip,
new_page, FALSE))) {
ut_error;
}
ut_ad(page_validate(new_page, index));
+ btr_blob_dbg_add(new_page, index,
+ "copy_end_reorg_fail");
return(NULL);
} else {
/* The page was reorganized:
@@ -803,12 +807,16 @@ page_copy_rec_list_start(
if (UNIV_UNLIKELY
(!page_zip_reorganize(new_block, index, mtr))) {
+ btr_blob_dbg_remove(new_page, index,
+ "copy_start_reorg_fail");
if (UNIV_UNLIKELY
(!page_zip_decompress(new_page_zip,
new_page, FALSE))) {
ut_error;
}
ut_ad(page_validate(new_page, index));
+ btr_blob_dbg_add(new_page, index,
+ "copy_start_reorg_fail");
return(NULL);
} else {
/* The page was reorganized:
@@ -1080,6 +1088,9 @@ page_delete_rec_list_end(
/* Remove the record chain segment from the record chain */
page_rec_set_next(prev_rec, page_get_supremum_rec(page));
+ btr_blob_dbg_op(page, rec, index, "delete_end",
+ btr_blob_dbg_remove_rec);
+
/* Catenate the deleted chain segment to the page free list */
page_rec_set_next(last_rec, page_header_get_ptr(page, PAGE_FREE));
diff --git a/storage/innobase/page/page0zip.c b/storage/innobase/page/page0zip.c
index 4c5371370da..704b0c88e02 100644
--- a/storage/innobase/page/page0zip.c
+++ b/storage/innobase/page/page0zip.c
@@ -4452,6 +4452,8 @@ page_zip_reorganize(
/* Copy the old page to temporary space */
buf_frame_copy(temp_page, page);
+ btr_blob_dbg_remove(page, index, "zip_reorg");
+
/* Recreate the page: note that global data on page (possible
segment headers, next page-field, etc.) is preserved intact */
@@ -4510,7 +4512,7 @@ page_zip_copy_recs(
mtr_t* mtr) /*!< in: mini-transaction */
{
ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
- ut_ad(mtr_memo_contains_page(mtr, (page_t*) src, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, src, MTR_MEMO_PAGE_X_FIX));
ut_ad(!dict_index_is_ibuf(index));
#ifdef UNIV_ZIP_DEBUG
/* The B-tree operations that call this function may set
@@ -4580,6 +4582,7 @@ page_zip_copy_recs(
#ifdef UNIV_ZIP_DEBUG
ut_a(page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
+ btr_blob_dbg_add(page, index, "page_zip_copy_recs");
page_zip_compress_write_log(page_zip, page, index, mtr);
}
diff --git a/storage/innobase/row/row0mysql.c b/storage/innobase/row/row0mysql.c
index 31aa49d4267..f7e5c5fdceb 100644
--- a/storage/innobase/row/row0mysql.c
+++ b/storage/innobase/row/row0mysql.c
@@ -3120,7 +3120,7 @@ row_drop_table_for_mysql(
ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */
- table = dict_table_get_low(name);
+ table = dict_table_get_low_ignore_err(name, DICT_ERR_IGNORE_INDEX_ROOT);
if (!table) {
err = DB_TABLE_NOT_FOUND;
@@ -3355,7 +3355,7 @@ check_next_foreign:
dict_table_remove_from_cache(table);
- if (dict_load_table(name, TRUE) != NULL) {
+ if (dict_load_table(name, TRUE, DICT_ERR_IGNORE_NONE) != NULL) {
ut_print_timestamp(stderr);
fputs(" InnoDB: Error: not able to remove table ",
stderr);
@@ -3501,7 +3501,7 @@ row_mysql_drop_temp_tables(void)
btr_pcur_store_position(&pcur, &mtr);
btr_pcur_commit_specify_mtr(&pcur, &mtr);
- table = dict_load_table(table_name, TRUE);
+ table = dict_load_table(table_name, TRUE, DICT_ERR_IGNORE_NONE);
if (table) {
row_drop_table_for_mysql(table_name, trx, FALSE);
diff --git a/storage/innobase/row/row0upd.c b/storage/innobase/row/row0upd.c
index c6bff0fccd6..ea71a16bed2 100644
--- a/storage/innobase/row/row0upd.c
+++ b/storage/innobase/row/row0upd.c
@@ -498,14 +498,49 @@ row_upd_rec_in_place(
n_fields = upd_get_n_fields(update);
for (i = 0; i < n_fields; i++) {
+#ifdef UNIV_BLOB_DEBUG
+ btr_blob_dbg_t b;
+ const byte* field_ref = NULL;
+#endif /* UNIV_BLOB_DEBUG */
+
upd_field = upd_get_nth_field(update, i);
new_val = &(upd_field->new_val);
ut_ad(!dfield_is_ext(new_val) ==
!rec_offs_nth_extern(offsets, upd_field->field_no));
+#ifdef UNIV_BLOB_DEBUG
+ if (dfield_is_ext(new_val)) {
+ ulint len;
+ field_ref = rec_get_nth_field(rec, offsets, i, &len);
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ b.ref_field_no = i;
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+ ut_a(b.ref_field_no >= index->n_uniq);
+ btr_blob_dbg_rbt_delete(index, &b, "upd_in_place");
+ }
+#endif /* UNIV_BLOB_DEBUG */
rec_set_nth_field(rec, offsets, upd_field->field_no,
dfield_get_data(new_val),
dfield_get_len(new_val));
+
+#ifdef UNIV_BLOB_DEBUG
+ if (dfield_is_ext(new_val)) {
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+ b.always_owner = b.owner = !(field_ref[BTR_EXTERN_LEN]
+ & BTR_EXTERN_OWNER_FLAG);
+ b.del = rec_get_deleted_flag(
+ rec, rec_offs_comp(offsets));
+
+ btr_blob_dbg_rbt_insert(index, &b, "upd_in_place");
+ }
+#endif /* UNIV_BLOB_DEBUG */
}
if (UNIV_LIKELY_NULL(page_zip)) {
diff --git a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c
index dcfd5463fbc..3e10aa03096 100644
--- a/storage/innobase/srv/srv0srv.c
+++ b/storage/innobase/srv/srv0srv.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
Copyright (c) 2009, Percona Inc.
@@ -269,9 +269,12 @@ UNIV_INTERN ulong srv_max_buf_pool_modified_pct = 75;
/* the number of purge threads to use from the worker pool (currently 0 or 1).*/
UNIV_INTERN ulong srv_n_purge_threads = 0;
-/* the number of records to purge in one batch */
+/* the number of pages to purge in one batch */
UNIV_INTERN ulong srv_purge_batch_size = 20;
+/* the number of rollback segments to use */
+UNIV_INTERN ulong srv_rollback_segments = TRX_SYS_N_RSEGS;
+
/* variable counts amount of data read in total (in bytes) */
UNIV_INTERN ulint srv_data_read = 0;
@@ -3096,6 +3099,7 @@ srv_purge_thread(
required by os_thread_create */
{
srv_slot_t* slot;
+ ulint retries = 0;
ulint slot_no = ULINT_UNDEFINED;
ulint n_total_purged = ULINT_UNDEFINED;
@@ -3122,7 +3126,7 @@ srv_purge_thread(
while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) {
- ulint n_pages_purged;
+ ulint n_pages_purged = 0;
/* If there are very few records to purge or the last
purge didn't purge any records then wait for activity.
@@ -3130,7 +3134,8 @@ srv_purge_thread(
because in the worst case we will end up waiting for
the next purge event. */
if (trx_sys->rseg_history_len < srv_purge_batch_size
- || n_total_purged == 0) {
+ || (n_total_purged == 0
+ && retries >= TRX_SYS_N_RSEGS)) {
os_event_t event;
@@ -3141,6 +3146,8 @@ srv_purge_thread(
mutex_exit(&kernel_mutex);
os_event_wait(event);
+
+ retries = 0;
}
/* Check for shutdown and whether we should do purge at all. */
@@ -3151,7 +3158,12 @@ srv_purge_thread(
break;
}
- n_total_purged = 0;
+ if (n_total_purged == 0 && retries <= TRX_SYS_N_RSEGS) {
+ ++retries;
+ } else if (n_total_purged > 0) {
+ retries = 0;
+ n_total_purged = 0;
+ }
/* Purge until there are no more records to purge and there is
no change in configuration or server state. */
diff --git a/storage/innobase/srv/srv0start.c b/storage/innobase/srv/srv0start.c
index 2a9ad0e7541..050fe460652 100644
--- a/storage/innobase/srv/srv0start.c
+++ b/storage/innobase/srv/srv0start.c
@@ -1083,6 +1083,12 @@ innobase_start_or_create_for_mysql(void)
# endif
#endif
+#ifdef UNIV_BLOB_DEBUG
+ fprintf(stderr,
+ "InnoDB: !!!!!!!! UNIV_BLOB_DEBUG switched on !!!!!!!!!\n"
+ "InnoDB: Server restart may fail with UNIV_BLOB_DEBUG\n");
+#endif /* UNIV_BLOB_DEBUG */
+
#ifdef UNIV_SYNC_DEBUG
ut_print_timestamp(stderr);
fprintf(stderr,
diff --git a/storage/innobase/sync/sync0rw.c b/storage/innobase/sync/sync0rw.c
index 295a010f9d7..73d1e3aa46e 100644
--- a/storage/innobase/sync/sync0rw.c
+++ b/storage/innobase/sync/sync0rw.c
@@ -271,6 +271,9 @@ rw_lock_create_func(
contains garbage at initialization and cannot be used for
recursive x-locking. */
lock->recursive = FALSE;
+ /* Silence Valgrind when UNIV_DEBUG_VALGRIND is not enabled. */
+ memset((void*) &lock->writer_thread, 0, sizeof lock->writer_thread);
+ UNIV_MEM_INVALID(&lock->writer_thread, sizeof lock->writer_thread);
#ifdef UNIV_SYNC_DEBUG
UT_LIST_INIT(lock->debug_list);
diff --git a/storage/innobase/sync/sync0sync.c b/storage/innobase/sync/sync0sync.c
index 453314f465d..0f6a60ca260 100644
--- a/storage/innobase/sync/sync0sync.c
+++ b/storage/innobase/sync/sync0sync.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -1172,7 +1172,7 @@ sync_thread_add_level(
case SYNC_RSEG:
case SYNC_TRX_UNDO:
case SYNC_PURGE_LATCH:
- case SYNC_PURGE_SYS:
+ case SYNC_PURGE_QUEUE:
case SYNC_DICT_AUTOINC_MUTEX:
case SYNC_DICT_OPERATION:
case SYNC_DICT_HEADER:
@@ -1239,10 +1239,16 @@ sync_thread_add_level(
|| sync_thread_levels_g(array, SYNC_FSP, TRUE));
break;
case SYNC_TRX_UNDO_PAGE:
+ /* Purge is allowed to read in as many UNDO pages as it likes,
+ there was a bogus rule here earlier that forced the caller to
+ acquire the purge_sys_t::mutex. The purge mutex did not really
+ protect anything because it was only ever acquired by the
+ single purge thread. The purge thread can read the UNDO pages
+ without any covering mutex. */
+
ut_a(sync_thread_levels_contain(array, SYNC_TRX_UNDO)
|| sync_thread_levels_contain(array, SYNC_RSEG)
- || sync_thread_levels_contain(array, SYNC_PURGE_SYS)
- || sync_thread_levels_g(array, SYNC_TRX_UNDO_PAGE, TRUE));
+ || sync_thread_levels_g(array, level - 1, TRUE));
break;
case SYNC_RSEG_HEADER:
ut_a(sync_thread_levels_contain(array, SYNC_RSEG));
diff --git a/storage/innobase/trx/trx0purge.c b/storage/innobase/trx/trx0purge.c
index 4c787579a03..02ec9f1c072 100644
--- a/storage/innobase/trx/trx0purge.c
+++ b/storage/innobase/trx/trx0purge.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -57,8 +57,8 @@ UNIV_INTERN mysql_pfs_key_t trx_purge_latch_key;
#endif /* UNIV_PFS_RWLOCK */
#ifdef UNIV_PFS_MUTEX
-/* Key to register purge_sys_mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t purge_sys_mutex_key;
+/* Key to register purge_sys_bh_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t purge_sys_bh_mutex_key;
#endif /* UNIV_PFS_MUTEX */
/*****************************************************************//**
@@ -219,13 +219,16 @@ Creates the global purge system control structure and inits the history
mutex. */
UNIV_INTERN
void
-trx_purge_sys_create(void)
-/*======================*/
+trx_purge_sys_create(
+/*=================*/
+ ib_bh_t* ib_bh) /*!< in, own: UNDO log min binary heap */
{
ut_ad(mutex_own(&kernel_mutex));
- purge_sys = mem_alloc(sizeof(trx_purge_t));
+ purge_sys = mem_zalloc(sizeof(trx_purge_t));
+ /* Take ownership of ib_bh, we are responsible for freeing it. */
+ purge_sys->ib_bh = ib_bh;
purge_sys->state = TRX_STOP_PURGE;
purge_sys->n_pages_handled = 0;
@@ -237,8 +240,9 @@ trx_purge_sys_create(void)
rw_lock_create(trx_purge_latch_key,
&purge_sys->latch, SYNC_PURGE_LATCH);
- mutex_create(purge_sys_mutex_key,
- &purge_sys->mutex, SYNC_PURGE_SYS);
+ mutex_create(
+ purge_sys_bh_mutex_key, &purge_sys->bh_mutex,
+ SYNC_PURGE_QUEUE);
purge_sys->heap = mem_heap_create(256);
@@ -288,9 +292,12 @@ trx_purge_sys_close(void)
trx_undo_arr_free(purge_sys->arr);
rw_lock_free(&purge_sys->latch);
- mutex_free(&purge_sys->mutex);
+ mutex_free(&purge_sys->bh_mutex);
mem_heap_free(purge_sys->heap);
+
+ ib_bh_free(purge_sys->ib_bh);
+
mem_free(purge_sys);
purge_sys = NULL;
@@ -311,34 +318,31 @@ trx_purge_add_update_undo_to_history(
mtr_t* mtr) /*!< in: mtr */
{
trx_undo_t* undo;
- trx_rseg_t* rseg;
trx_rsegf_t* rseg_header;
-#ifdef UNIV_DEBUG
- trx_usegf_t* seg_header;
-#endif /* UNIV_DEBUG */
trx_ulogf_t* undo_header;
- ulint hist_size;
undo = trx->update_undo;
ut_ad(undo);
- rseg = undo->rseg;
+ ut_ad(mutex_own(&undo->rseg->mutex));
- ut_ad(mutex_own(&(rseg->mutex)));
-
- rseg_header = trx_rsegf_get(rseg->space, rseg->zip_size,
- rseg->page_no, mtr);
+ rseg_header = trx_rsegf_get(
+ undo->rseg->space, undo->rseg->zip_size, undo->rseg->page_no,
+ mtr);
undo_header = undo_page + undo->hdr_offset;
+ /* Add the log as the first in the history list */
+
+ if (undo->state != TRX_UNDO_CACHED) {
+ ulint hist_size;
#ifdef UNIV_DEBUG
- seg_header = undo_page + TRX_UNDO_SEG_HDR;
+ trx_usegf_t* seg_header = undo_page + TRX_UNDO_SEG_HDR;
#endif /* UNIV_DEBUG */
- if (undo->state != TRX_UNDO_CACHED) {
/* The undo log segment will not be reused */
- if (undo->id >= TRX_RSEG_N_SLOTS) {
+ if (UNIV_UNLIKELY(undo->id >= TRX_RSEG_N_SLOTS)) {
fprintf(stderr,
"InnoDB: Error: undo->id is %lu\n",
(ulong) undo->id);
@@ -347,42 +351,50 @@ trx_purge_add_update_undo_to_history(
trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr);
- hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
- MLOG_4BYTES, mtr);
+ hist_size = mtr_read_ulint(
+ rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr);
+
ut_ad(undo->size == flst_get_len(
seg_header + TRX_UNDO_PAGE_LIST, mtr));
- mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
- hist_size + undo->size, MLOG_4BYTES, mtr);
+ mlog_write_ulint(
+ rseg_header + TRX_RSEG_HISTORY_SIZE,
+ hist_size + undo->size, MLOG_4BYTES, mtr);
}
- /* Add the log as the first in the history list */
- flst_add_first(rseg_header + TRX_RSEG_HISTORY,
- undo_header + TRX_UNDO_HISTORY_NODE, mtr);
- mutex_enter(&kernel_mutex);
- trx_sys->rseg_history_len++;
- mutex_exit(&kernel_mutex);
-
- if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) {
- /* Inform the purge thread that there is work to do. */
- srv_wake_purge_thread_if_not_active();
- }
+ flst_add_first(
+ rseg_header + TRX_RSEG_HISTORY,
+ undo_header + TRX_UNDO_HISTORY_NODE, mtr);
/* Write the trx number to the undo log header */
+
mlog_write_ull(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr);
+
/* Write information about delete markings to the undo log header */
if (!undo->del_marks) {
- mlog_write_ulint(undo_header + TRX_UNDO_DEL_MARKS, FALSE,
- MLOG_2BYTES, mtr);
+ mlog_write_ulint(
+ undo_header + TRX_UNDO_DEL_MARKS, FALSE,
+ MLOG_2BYTES, mtr);
}
- if (rseg->last_page_no == FIL_NULL) {
+ if (undo->rseg->last_page_no == FIL_NULL) {
+ undo->rseg->last_trx_no = trx->no;
+ undo->rseg->last_offset = undo->hdr_offset;
+ undo->rseg->last_page_no = undo->hdr_page_no;
+ undo->rseg->last_del_marks = undo->del_marks;
- rseg->last_page_no = undo->hdr_page_no;
- rseg->last_offset = undo->hdr_offset;
- rseg->last_trx_no = trx->no;
- rseg->last_del_marks = undo->del_marks;
+ /* FIXME: Add a bin heap validate function to check that
+ the rseg exists. */
+ }
+
+ mutex_enter(&kernel_mutex);
+ trx_sys->rseg_history_len++;
+ mutex_exit(&kernel_mutex);
+
+ if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) {
+ /* Inform the purge thread that there is work to do. */
+ srv_wake_purge_thread_if_not_active();
}
}
@@ -411,7 +423,6 @@ trx_purge_free_segment(
/* fputs("Freeing an update undo log segment\n", stderr); */
- ut_ad(mutex_own(&(purge_sys->mutex)));
loop:
mtr_start(&mtr);
mutex_enter(&(rseg->mutex));
@@ -515,8 +526,6 @@ trx_purge_truncate_rseg_history(
mtr_t mtr;
trx_id_t undo_trx_no;
- ut_ad(mutex_own(&(purge_sys->mutex)));
-
mtr_start(&mtr);
mutex_enter(&(rseg->mutex));
@@ -609,10 +618,8 @@ trx_purge_truncate_history(void)
trx_id_t limit_trx_no;
undo_no_t limit_undo_no;
- ut_ad(mutex_own(&(purge_sys->mutex)));
-
- trx_purge_arr_get_biggest(purge_sys->arr, &limit_trx_no,
- &limit_undo_no);
+ trx_purge_arr_get_biggest(
+ purge_sys->arr, &limit_trx_no, &limit_undo_no);
if (limit_trx_no == 0) {
@@ -630,34 +637,29 @@ trx_purge_truncate_history(void)
ut_ad(limit_trx_no <= purge_sys->view->low_limit_no);
- rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+ for (rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+ rseg != NULL;
+ rseg = UT_LIST_GET_NEXT(rseg_list, rseg)) {
- while (rseg) {
- trx_purge_truncate_rseg_history(rseg, limit_trx_no,
- limit_undo_no);
- rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
+ trx_purge_truncate_rseg_history(
+ rseg, limit_trx_no, limit_undo_no);
}
}
/********************************************************************//**
Does a truncate if the purge array is empty. NOTE that when this function is
-called, the caller must not have any latches on undo log pages!
-@return TRUE if array empty */
+called, the caller must not have any latches on undo log pages! */
UNIV_INLINE
-ibool
+void
trx_purge_truncate_if_arr_empty(void)
/*=================================*/
{
- ut_ad(mutex_own(&(purge_sys->mutex)));
+ static ulint count;
- if (purge_sys->arr->n_used == 0) {
+ if (!(++count % TRX_SYS_N_RSEGS) && purge_sys->arr->n_used == 0) {
trx_purge_truncate_history();
-
- return(TRUE);
}
-
- return(FALSE);
}
/***********************************************************************//**
@@ -675,8 +677,8 @@ trx_purge_rseg_get_next_history_log(
trx_id_t trx_no;
ibool del_marks;
mtr_t mtr;
-
- ut_ad(mutex_own(&(purge_sys->mutex)));
+ rseg_queue_t rseg_queue;
+ const void* ptr;
mutex_enter(&(rseg->mutex));
@@ -688,8 +690,9 @@ trx_purge_rseg_get_next_history_log(
mtr_start(&mtr);
- undo_page = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size,
- rseg->last_page_no, &mtr);
+ undo_page = trx_undo_page_get_s_latched(
+ rseg->space, rseg->zip_size, rseg->last_page_no, &mtr);
+
log_hdr = undo_page + rseg->last_offset;
/* Increase the purge page count by one for every handled log */
@@ -698,6 +701,7 @@ trx_purge_rseg_get_next_history_log(
prev_log_addr = trx_purge_get_log_from_hist(
flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr));
+
if (prev_log_addr.page == FIL_NULL) {
/* No logs left in the history list */
@@ -712,11 +716,11 @@ trx_purge_rseg_get_next_history_log(
on the MySQL mailing list on Nov 9, 2004. The fut0lst.c
file-based list was corrupt. The prev node pointer was
FIL_NULL, even though the list length was over 8 million nodes!
- We assume that purge truncates the history list in moderate
+ We assume that purge truncates the history list in large
size pieces, and if we here reach the head of the list, the
- list cannot be longer than 20 000 undo logs now. */
+ list cannot be longer than 2000 000 undo logs now. */
- if (trx_sys->rseg_history_len > 20000) {
+ if (trx_sys->rseg_history_len > 2000000) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Warning: purge reached the"
@@ -756,105 +760,150 @@ trx_purge_rseg_get_next_history_log(
rseg->last_trx_no = trx_no;
rseg->last_del_marks = del_marks;
+ rseg_queue.rseg = rseg;
+ rseg_queue.trx_no = rseg->last_trx_no;
+
+ /* Purge can also produce events, however these are already ordered
+ in the rollback segment and any user generated event will be greater
+ than the events that Purge produces. ie. Purge can never produce
+ events from an empty rollback segment. */
+
+ mutex_enter(&purge_sys->bh_mutex);
+
+ ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue);
+ ut_a(ptr != NULL);
+
+ mutex_exit(&purge_sys->bh_mutex);
+
mutex_exit(&(rseg->mutex));
}
/***********************************************************************//**
-Chooses the next undo log to purge and updates the info in purge_sys. This
-function is used to initialize purge_sys when the next record to purge is
-not known, and also to update the purge system info on the next record when
-purge has handled the whole undo log for a transaction. */
+Chooses the rollback segment with the smallest trx_id.
+@return zip_size if log is for a compressed table, ULINT_UNDEFINED if
+ no rollback segments to purge, 0 for non compressed tables. */
static
-void
-trx_purge_choose_next_log(void)
-/*===========================*/
+ulint
+trx_purge_get_rseg_with_min_trx_id(
+/*===============================*/
+ trx_purge_t* purge_sys) /*!< in/out: purge instance */
+
{
- trx_undo_rec_t* rec;
- trx_rseg_t* rseg;
- trx_rseg_t* min_rseg;
- trx_id_t min_trx_no;
- ulint space = 0; /* remove warning (??? bug ???) */
ulint zip_size = 0;
- ulint page_no = 0; /* remove warning (??? bug ???) */
- ulint offset = 0; /* remove warning (??? bug ???) */
- mtr_t mtr;
- ut_ad(mutex_own(&(purge_sys->mutex)));
- ut_ad(purge_sys->next_stored == FALSE);
+ mutex_enter(&purge_sys->bh_mutex);
- rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+ /* Only purge consumes events from the binary heap, user
+ threads only produce the events. */
- min_trx_no = IB_ULONGLONG_MAX;
+ if (!ib_bh_is_empty(purge_sys->ib_bh)) {
+ trx_rseg_t* rseg;
- min_rseg = NULL;
+ rseg = ((rseg_queue_t*) ib_bh_first(purge_sys->ib_bh))->rseg;
+ ib_bh_pop(purge_sys->ib_bh);
- while (rseg) {
- mutex_enter(&(rseg->mutex));
+ mutex_exit(&purge_sys->bh_mutex);
- if (rseg->last_page_no != FIL_NULL) {
+ purge_sys->rseg = rseg;
+ } else {
+ mutex_exit(&purge_sys->bh_mutex);
- if (min_rseg == NULL
- || min_trx_no > rseg->last_trx_no) {
+ purge_sys->rseg = NULL;
- min_rseg = rseg;
- min_trx_no = rseg->last_trx_no;
- space = rseg->space;
- zip_size = rseg->zip_size;
- ut_a(space == 0); /* We assume in purge of
- externally stored fields
- that space id == 0 */
- page_no = rseg->last_page_no;
- offset = rseg->last_offset;
- }
- }
+ return(ULINT_UNDEFINED);
+ }
- mutex_exit(&(rseg->mutex));
+ ut_a(purge_sys->rseg != NULL);
- rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
- }
+ mutex_enter(&purge_sys->rseg->mutex);
- if (min_rseg == NULL) {
+ ut_a(purge_sys->rseg->last_page_no != FIL_NULL);
- return;
- }
+ /* We assume in purge of externally stored fields
+ that space id == 0 */
+ ut_a(purge_sys->rseg->space == 0);
- mtr_start(&mtr);
+ zip_size = purge_sys->rseg->zip_size;
- if (!min_rseg->last_del_marks) {
- /* No need to purge this log */
+ ut_a(purge_sys->purge_trx_no <= purge_sys->rseg->last_trx_no);
- rec = &trx_purge_dummy_rec;
- } else {
- rec = trx_undo_get_first_rec(space, zip_size, page_no, offset,
- RW_S_LATCH, &mtr);
- if (rec == NULL) {
- /* Undo log empty */
+ purge_sys->purge_trx_no = purge_sys->rseg->last_trx_no;
+
+ purge_sys->hdr_offset = purge_sys->rseg->last_offset;
+
+ purge_sys->hdr_page_no = purge_sys->rseg->last_page_no;
+
+ mutex_exit(&purge_sys->rseg->mutex);
+
+ return(zip_size);
+}
+
+/***********************************************************************//**
+Position the purge sys "iterator" on the undo record to use for purging. */
+static
+void
+trx_purge_read_undo_rec(
+/*====================*/
+ trx_purge_t* purge_sys, /*!< in/out: purge instance */
+ ulint zip_size) /*!< in: block size or 0 */
+{
+ ulint page_no;
+ ulint offset = 0;
+ ib_uint64_t undo_no = 0;
+
+ purge_sys->hdr_offset = purge_sys->rseg->last_offset;
+ page_no = purge_sys->hdr_page_no = purge_sys->rseg->last_page_no;
+
+ if (purge_sys->rseg->last_del_marks) {
+ mtr_t mtr;
+ trx_undo_rec_t* undo_rec;
- rec = &trx_purge_dummy_rec;
+ mtr_start(&mtr);
+
+ undo_rec = trx_undo_get_first_rec(
+ 0 /* System space id */, zip_size,
+ purge_sys->hdr_page_no,
+ purge_sys->hdr_offset, RW_S_LATCH, &mtr);
+
+ if (undo_rec != NULL) {
+ offset = page_offset(undo_rec);
+ undo_no = trx_undo_rec_get_undo_no(undo_rec);
+ page_no = page_get_page_no(page_align(undo_rec));
}
+
+ mtr_commit(&mtr);
}
+ purge_sys->offset = offset;
+ purge_sys->page_no = page_no;
+ purge_sys->purge_undo_no = undo_no;
+
purge_sys->next_stored = TRUE;
- purge_sys->rseg = min_rseg;
+}
- purge_sys->hdr_page_no = page_no;
- purge_sys->hdr_offset = offset;
+/***********************************************************************//**
+Chooses the next undo log to purge and updates the info in purge_sys. This
+function is used to initialize purge_sys when the next record to purge is
+not known, and also to update the purge system info on the next record when
+purge has handled the whole undo log for a transaction. */
+static
+void
+trx_purge_choose_next_log(void)
+/*===========================*/
+{
+ ulint zip_size;
- purge_sys->purge_trx_no = min_trx_no;
+ ut_ad(purge_sys->next_stored == FALSE);
- if (rec == &trx_purge_dummy_rec) {
+ zip_size = trx_purge_get_rseg_with_min_trx_id(purge_sys);
- purge_sys->purge_undo_no = 0;
- purge_sys->page_no = page_no;
- purge_sys->offset = 0;
- } else {
- purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec);
+ if (purge_sys->rseg != NULL) {
- purge_sys->page_no = page_get_page_no(page_align(rec));
- purge_sys->offset = page_offset(rec);
+ trx_purge_read_undo_rec(purge_sys, zip_size);
+ } else {
+ /* There is nothing to do yet. */
+ os_thread_yield();
}
-
- mtr_commit(&mtr);
}
/***********************************************************************//**
@@ -880,7 +929,6 @@ trx_purge_get_next_rec(
ulint cmpl_info;
mtr_t mtr;
- ut_ad(mutex_own(&(purge_sys->mutex)));
ut_ad(purge_sys->next_stored);
space = purge_sys->rseg->space;
@@ -903,8 +951,8 @@ trx_purge_get_next_rec(
mtr_start(&mtr);
- undo_page = trx_undo_page_get_s_latched(space, zip_size,
- page_no, &mtr);
+ undo_page = trx_undo_page_get_s_latched(space, zip_size, page_no, &mtr);
+
rec = undo_page + offset;
rec2 = rec;
@@ -913,9 +961,9 @@ trx_purge_get_next_rec(
/* Try first to find the next record which requires a purge
operation from the same page of the same undo log */
- next_rec = trx_undo_page_get_next_rec(rec2,
- purge_sys->hdr_page_no,
- purge_sys->hdr_offset);
+ next_rec = trx_undo_page_get_next_rec(
+ rec2, purge_sys->hdr_page_no, purge_sys->hdr_offset);
+
if (next_rec == NULL) {
rec2 = trx_undo_get_next_rec(
rec2, purge_sys->hdr_page_no,
@@ -995,17 +1043,12 @@ trx_purge_fetch_next_rec(
{
trx_undo_rec_t* undo_rec;
- mutex_enter(&(purge_sys->mutex));
if (purge_sys->state == TRX_STOP_PURGE) {
trx_purge_truncate_if_arr_empty();
- mutex_exit(&(purge_sys->mutex));
-
return(NULL);
- }
-
- if (!purge_sys->next_stored) {
+ } else if (!purge_sys->next_stored) {
trx_purge_choose_next_log();
if (!purge_sys->next_stored) {
@@ -1020,8 +1063,6 @@ trx_purge_fetch_next_rec(
(ulong) purge_sys->n_pages_handled);
}
- mutex_exit(&(purge_sys->mutex));
-
return(NULL);
}
}
@@ -1032,18 +1073,12 @@ trx_purge_fetch_next_rec(
trx_purge_truncate_if_arr_empty();
- mutex_exit(&(purge_sys->mutex));
-
return(NULL);
- }
-
- if (purge_sys->purge_trx_no >= purge_sys->view->low_limit_no) {
+ } else if (purge_sys->purge_trx_no >= purge_sys->view->low_limit_no) {
purge_sys->state = TRX_STOP_PURGE;
trx_purge_truncate_if_arr_empty();
- mutex_exit(&(purge_sys->mutex));
-
return(NULL);
}
@@ -1052,12 +1087,13 @@ trx_purge_fetch_next_rec(
(ullint) purge_sys->purge_trx_no,
(ullint) purge_sys->purge_undo_no); */
- *roll_ptr = trx_undo_build_roll_ptr(FALSE, (purge_sys->rseg)->id,
- purge_sys->page_no,
- purge_sys->offset);
- *cell = trx_purge_arr_store_info(purge_sys->purge_trx_no,
- purge_sys->purge_undo_no);
+ *roll_ptr = trx_undo_build_roll_ptr(
+ FALSE, (purge_sys->rseg)->id, purge_sys->page_no,
+ purge_sys->offset);
+
+ *cell = trx_purge_arr_store_info(
+ purge_sys->purge_trx_no, purge_sys->purge_undo_no);
ut_ad(purge_sys->purge_trx_no < purge_sys->view->low_limit_no);
@@ -1066,8 +1102,6 @@ trx_purge_fetch_next_rec(
undo_rec = trx_purge_get_next_rec(heap);
- mutex_exit(&(purge_sys->mutex));
-
return(undo_rec);
}
@@ -1079,11 +1113,7 @@ trx_purge_rec_release(
/*==================*/
trx_undo_inf_t* cell) /*!< in: storage cell */
{
- mutex_enter(&(purge_sys->mutex));
-
trx_purge_arr_remove_info(cell);
-
- mutex_exit(&(purge_sys->mutex));
}
/*******************************************************************//**
@@ -1097,23 +1127,11 @@ trx_purge(
purge in one batch */
{
que_thr_t* thr;
- /* que_thr_t* thr2; */
ulint old_pages_handled;
- mutex_enter(&(purge_sys->mutex));
-
- if (purge_sys->trx->n_active_thrs > 0) {
+ ut_a(purge_sys->trx->n_active_thrs == 0);
- mutex_exit(&(purge_sys->mutex));
-
- /* Should not happen */
-
- ut_error;
-
- return(0);
- }
-
- rw_lock_x_lock(&(purge_sys->latch));
+ rw_lock_x_lock(&purge_sys->latch);
mutex_enter(&kernel_mutex);
@@ -1147,8 +1165,9 @@ trx_purge(
}
}
- purge_sys->view = read_view_oldest_copy_or_open_new(0,
- purge_sys->heap);
+ purge_sys->view = read_view_oldest_copy_or_open_new(
+ 0, purge_sys->heap);
+
mutex_exit(&kernel_mutex);
rw_lock_x_unlock(&(purge_sys->latch));
@@ -1159,7 +1178,6 @@ trx_purge(
old_pages_handled = purge_sys->n_pages_handled;
- mutex_exit(&(purge_sys->mutex));
mutex_enter(&kernel_mutex);
@@ -1167,15 +1185,8 @@ trx_purge(
ut_ad(thr);
- /* thr2 = que_fork_start_command(purge_sys->query);
-
- ut_ad(thr2); */
-
-
mutex_exit(&kernel_mutex);
- /* srv_que_task_enqueue(thr2); */
-
if (srv_print_thread_releases) {
fputs("Starting purge\n", stderr);
diff --git a/storage/innobase/trx/trx0rseg.c b/storage/innobase/trx/trx0rseg.c
index 740320f68c1..85beac8afbc 100644
--- a/storage/innobase/trx/trx0rseg.c
+++ b/storage/innobase/trx/trx0rseg.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle Corpn. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -50,11 +50,11 @@ trx_rseg_get_on_id(
{
trx_rseg_t* rseg;
- rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+ ut_a(id < TRX_SYS_N_RSEGS);
- while (rseg && rseg->id != id) {
- rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
- }
+ rseg = trx_sys->rseg_array[id];
+
+ ut_a(rseg == NULL || id == rseg->id);
return(rseg);
}
@@ -181,12 +181,15 @@ static
trx_rseg_t*
trx_rseg_mem_create(
/*================*/
- ulint id, /*!< in: rollback segment id */
- ulint space, /*!< in: space where the segment placed */
- ulint zip_size, /*!< in: compressed page size in bytes
- or 0 for uncompressed pages */
- ulint page_no, /*!< in: page number of the segment header */
- mtr_t* mtr) /*!< in: mtr */
+ ulint id, /*!< in: rollback segment id */
+ ulint space, /*!< in: space where the segment
+ placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the segment
+ header */
+ ib_bh_t* ib_bh, /*!< in/out: rseg queue */
+ mtr_t* mtr) /*!< in: mtr */
{
ulint len;
trx_rseg_t* rseg;
@@ -225,6 +228,9 @@ trx_rseg_mem_create(
len = flst_get_len(rseg_header + TRX_RSEG_HISTORY, mtr);
if (len > 0) {
+ const void* ptr;
+ rseg_queue_t rseg_queue;
+
trx_sys->rseg_history_len += len;
node_addr = trx_purge_get_log_from_hist(
@@ -240,6 +246,17 @@ trx_rseg_mem_create(
undo_log_hdr + TRX_UNDO_TRX_NO);
rseg->last_del_marks = mtr_read_ulint(
undo_log_hdr + TRX_UNDO_DEL_MARKS, MLOG_2BYTES, mtr);
+
+ rseg_queue.rseg = rseg;
+ rseg_queue.trx_no = rseg->last_trx_no;
+
+ if (rseg->last_page_no != FIL_NULL) {
+ /* There is no need to cover this operation by the purge
+ mutex because we are still bootstrapping. */
+
+ ptr = ib_bh_push(ib_bh, &rseg_queue);
+ ut_a(ptr != NULL);
+ }
} else {
rseg->last_page_no = FIL_NULL;
}
@@ -255,6 +272,7 @@ void
trx_rseg_create_instance(
/*=====================*/
trx_sysf_t* sys_header, /*!< in: trx system header */
+ ib_bh_t* ib_bh, /*!< in/out: rseg queue */
mtr_t* mtr) /*!< in: mtr */
{
ulint i;
@@ -278,7 +296,7 @@ trx_rseg_create_instance(
zip_size = space ? fil_space_get_zip_size(space) : 0;
rseg = trx_rseg_mem_create(
- i, space, zip_size, page_no, mtr);
+ i, space, zip_size, page_no, ib_bh, mtr);
ut_a(rseg->id == i);
}
@@ -327,7 +345,8 @@ trx_rseg_create(void)
zip_size = space ? fil_space_get_zip_size(space) : 0;
rseg = trx_rseg_mem_create(
- slot_no, space, zip_size, page_no, &mtr);
+ slot_no, space, zip_size, page_no,
+ purge_sys->ib_bh, &mtr);
}
mutex_exit(&kernel_mutex);
@@ -342,13 +361,14 @@ UNIV_INTERN
void
trx_rseg_list_and_array_init(
/*=========================*/
- trx_sysf_t* sys_header, /* in: trx system header */
- mtr_t* mtr) /* in: mtr */
+ trx_sysf_t* sys_header, /*!< in: trx system header */
+ ib_bh_t* ib_bh, /*!< in: rseg queue */
+ mtr_t* mtr) /*!< in: mtr */
{
UT_LIST_INIT(trx_sys->rseg_list);
trx_sys->rseg_history_len = 0;
- trx_rseg_create_instance(sys_header, mtr);
+ trx_rseg_create_instance(sys_header, ib_bh, mtr);
}
diff --git a/storage/innobase/trx/trx0sys.c b/storage/innobase/trx/trx0sys.c
index 101f225a06f..7af9fbb1af8 100644
--- a/storage/innobase/trx/trx0sys.c
+++ b/storage/innobase/trx/trx0sys.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -946,6 +946,31 @@ trx_sysf_create(
}
/*****************************************************************//**
+Compare two trx_rseg_t instances on last_trx_no. */
+static
+int
+trx_rseg_compare_last_trx_no(
+/*=========================*/
+ const void* p1, /*!< in: elem to compare */
+ const void* p2) /*!< in: elem to compare */
+{
+ ib_int64_t cmp;
+
+ const rseg_queue_t* rseg_q1 = (const rseg_queue_t*) p1;
+ const rseg_queue_t* rseg_q2 = (const rseg_queue_t*) p2;
+
+ cmp = rseg_q1->trx_no - rseg_q2->trx_no;
+
+ if (cmp < 0) {
+ return(-1);
+ } else if (cmp > 0) {
+ return(1);
+ }
+
+ return(0);
+}
+
+/*****************************************************************//**
Creates and initializes the central memory structures for the transaction
system. This is called when the database is started. */
UNIV_INTERN
@@ -958,6 +983,7 @@ trx_sys_init_at_db_start(void)
const char* unit = "";
trx_t* trx;
mtr_t mtr;
+ ib_bh_t* ib_bh;
mtr_start(&mtr);
@@ -965,11 +991,19 @@ trx_sys_init_at_db_start(void)
mutex_enter(&kernel_mutex);
- trx_sys = mem_alloc(sizeof(trx_sys_t));
+ /* We create the min binary heap here and pass ownership to
+ purge when we init the purge sub-system. Purge is responsible
+ for freeing the binary heap. */
+
+ ib_bh = ib_bh_create(
+ trx_rseg_compare_last_trx_no,
+ sizeof(rseg_queue_t), TRX_SYS_N_RSEGS);
+
+ trx_sys = mem_zalloc(sizeof(*trx_sys));
sys_header = trx_sysf_get(&mtr);
- trx_rseg_list_and_array_init(sys_header, &mtr);
+ trx_rseg_list_and_array_init(sys_header, ib_bh, &mtr);
trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
@@ -1023,7 +1057,8 @@ trx_sys_init_at_db_start(void)
UT_LIST_INIT(trx_sys->view_list);
- trx_purge_sys_create();
+ /* Transfer ownership to purge. */
+ trx_purge_sys_create(ib_bh);
mutex_exit(&kernel_mutex);
diff --git a/storage/innobase/trx/trx0trx.c b/storage/innobase/trx/trx0trx.c
index 4bbcee210b0..820c40fe857 100644
--- a/storage/innobase/trx/trx0trx.c
+++ b/storage/innobase/trx/trx0trx.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -42,6 +42,7 @@ Created 3/26/1996 Heikki Tuuri
#include "btr0sea.h"
#include "os0proc.h"
#include "trx0xa.h"
+#include "trx0purge.h"
#include "ha_prototypes.h"
/** Dummy session used currently in MySQL interface */
@@ -604,36 +605,26 @@ trx_lists_init_at_db_start(void)
/******************************************************************//**
Assigns a rollback segment to a transaction in a round-robin fashion.
-Skips the SYSTEM rollback segment if another is available.
-@return assigned rollback segment id */
+@return assigned rollback segment instance */
UNIV_INLINE
-ulint
-trx_assign_rseg(void)
-/*=================*/
+trx_rseg_t*
+trx_assign_rseg(
+/*============*/
+ ulint max_undo_logs) /*!< in: maximum number of UNDO logs to use */
{
- trx_rseg_t* rseg = trx_sys->latest_rseg;
+ trx_rseg_t* rseg = trx_sys->latest_rseg;
ut_ad(mutex_own(&kernel_mutex));
-loop:
- /* Get next rseg in a round-robin fashion */
rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
- if (rseg == NULL) {
+ if (rseg == NULL || rseg->id == max_undo_logs - 1) {
rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
}
- /* If it is the SYSTEM rollback segment, and there exist others, skip
- it */
-
- if ((rseg->id == TRX_SYS_SYSTEM_RSEG_ID)
- && (UT_LIST_GET_LEN(trx_sys->rseg_list) > 1)) {
- goto loop;
- }
-
trx_sys->latest_rseg = rseg;
- return(rseg->id);
+ return(rseg);
}
/****************************************************************//**
@@ -663,12 +654,9 @@ trx_start_low(
ut_ad(trx->conc_state != TRX_ACTIVE);
- if (rseg_id == ULINT_UNDEFINED) {
-
- rseg_id = trx_assign_rseg();
- }
+ ut_a(rseg_id == ULINT_UNDEFINED);
- rseg = trx_sys_get_nth_rseg(trx_sys, rseg_id);
+ rseg = trx_assign_rseg(srv_rollback_segments);
trx->id = trx_sys_get_new_trx_id();
@@ -719,107 +707,179 @@ trx_start(
}
/****************************************************************//**
-Commits a transaction. */
-UNIV_INTERN
+Set the transaction serialisation number. */
+static
void
-trx_commit_off_kernel(
-/*==================*/
- trx_t* trx) /*!< in: transaction */
+trx_serialisation_number_get(
+/*=========================*/
+ trx_t* trx) /*!< in: transaction */
{
- page_t* update_hdr_page;
- ib_uint64_t lsn = 0;
trx_rseg_t* rseg;
- trx_undo_t* undo;
- mtr_t mtr;
- ut_ad(mutex_own(&kernel_mutex));
+ rseg = trx->rseg;
- trx->must_flush_log_later = FALSE;
+ ut_ad(mutex_own(&rseg->mutex));
- rseg = trx->rseg;
+ mutex_enter(&kernel_mutex);
- if (trx->insert_undo != NULL || trx->update_undo != NULL) {
+ trx->no = trx_sys_get_new_trx_id();
+
+ /* If the rollack segment is not empty then the
+ new trx_t::no can't be less than any trx_t::no
+ already in the rollback segment. User threads only
+ produce events when a rollback segment is empty. */
+
+ if (rseg->last_page_no == FIL_NULL) {
+ void* ptr;
+ rseg_queue_t rseg_queue;
+
+ rseg_queue.rseg = rseg;
+ rseg_queue.trx_no = trx->no;
+
+ mutex_enter(&purge_sys->bh_mutex);
+
+ /* This is to reduce the pressure on the kernel mutex,
+ though in reality it should make very little (read no)
+ difference because this code path is only taken when the
+ rbs is empty. */
mutex_exit(&kernel_mutex);
- mtr_start(&mtr);
+ ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue);
+ ut_a(ptr);
- /* Change the undo log segment states from TRX_UNDO_ACTIVE
- to some other state: these modifications to the file data
- structure define the transaction as committed in the file
- based world, at the serialization point of the log sequence
- number lsn obtained below. */
+ mutex_exit(&purge_sys->bh_mutex);
+ } else {
+ mutex_exit(&kernel_mutex);
+ }
+}
- mutex_enter(&(rseg->mutex));
+/****************************************************************//**
+Assign the transaction its history serialisation number and write the
+update UNDO log record to the assigned rollback segment.
+@return the LSN of the UNDO log write. */
+static
+ib_uint64_t
+trx_write_serialisation_history(
+/*============================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ mtr_t mtr;
+ trx_rseg_t* rseg;
- if (trx->insert_undo != NULL) {
- trx_undo_set_state_at_finish(trx->insert_undo, &mtr);
- }
+ ut_ad(!mutex_own(&kernel_mutex));
- undo = trx->update_undo;
+ rseg = trx->rseg;
- if (undo) {
- mutex_enter(&kernel_mutex);
- trx->no = trx_sys_get_new_trx_no();
- mutex_exit(&kernel_mutex);
+ mtr_start(&mtr);
- /* It is not necessary to obtain trx->undo_mutex here
- because only a single OS thread is allowed to do the
- transaction commit for this transaction. */
+ /* Change the undo log segment states from TRX_UNDO_ACTIVE
+ to some other state: these modifications to the file data
+ structure define the transaction as committed in the file
+ based domain, at the serialization point of the log sequence
+ number lsn obtained below. */
- update_hdr_page = trx_undo_set_state_at_finish(
- undo, &mtr);
+ if (trx->update_undo != NULL) {
+ page_t* undo_hdr_page;
+ trx_undo_t* undo = trx->update_undo;
- /* We have to do the cleanup for the update log while
- holding the rseg mutex because update log headers
- have to be put to the history list in the order of
- the trx number. */
+ /* We have to hold the rseg mutex because update
+ log headers have to be put to the history list in the
+ (serialisation) order of the UNDO trx number. This is
+ required for the purge in-memory data structures too. */
- trx_undo_update_cleanup(trx, update_hdr_page, &mtr);
- }
+ mutex_enter(&rseg->mutex);
- mutex_exit(&(rseg->mutex));
+ /* Assign the transaction serialisation number and also
+ update the purge min binary heap if this is the first
+ UNDO log being written to the assigned rollback segment. */
- /* Update the latest MySQL binlog name and offset info
- in trx sys header if MySQL binlogging is on or the database
- server is a MySQL replication slave */
-
- if (trx->mysql_log_file_name
- && trx->mysql_log_file_name[0] != '\0') {
- trx_sys_update_mysql_binlog_offset(
- trx->mysql_log_file_name,
- trx->mysql_log_offset,
- TRX_SYS_MYSQL_LOG_INFO, &mtr);
- trx->mysql_log_file_name = NULL;
- }
+ trx_serialisation_number_get(trx);
- /* The following call commits the mini-transaction, making the
- whole transaction committed in the file-based world, at this
- log sequence number. The transaction becomes 'durable' when
- we write the log to disk, but in the logical sense the commit
- in the file-based data structures (undo logs etc.) happens
- here.
-
- NOTE that transaction numbers, which are assigned only to
- transactions with an update undo log, do not necessarily come
- in exactly the same order as commit lsn's, if the transactions
- have different rollback segments. To get exactly the same
- order we should hold the kernel mutex up to this point,
- adding to the contention of the kernel mutex. However, if
- a transaction T2 is able to see modifications made by
- a transaction T1, T2 will always get a bigger transaction
- number and a bigger commit lsn than T1. */
+ /* It is not necessary to obtain trx->undo_mutex here
+ because only a single OS thread is allowed to do the
+ transaction commit for this transaction. */
- /*--------------*/
- mtr_commit(&mtr);
- /*--------------*/
- lsn = mtr.end_lsn;
+ undo_hdr_page = trx_undo_set_state_at_finish(undo, &mtr);
+
+ trx_undo_update_cleanup(trx, undo_hdr_page, &mtr);
+ } else {
+ mutex_enter(&rseg->mutex);
+ }
+
+ if (trx->insert_undo != NULL) {
+ trx_undo_set_state_at_finish(trx->insert_undo, &mtr);
+ }
+
+ mutex_exit(&rseg->mutex);
+
+ /* Update the latest MySQL binlog name and offset info
+ in trx sys header if MySQL binlogging is on or the database
+ server is a MySQL replication slave */
+
+ if (trx->mysql_log_file_name
+ && trx->mysql_log_file_name[0] != '\0') {
+
+ trx_sys_update_mysql_binlog_offset(
+ trx->mysql_log_file_name,
+ trx->mysql_log_offset,
+ TRX_SYS_MYSQL_LOG_INFO, &mtr);
+
+ trx->mysql_log_file_name = NULL;
+ }
+
+ /* The following call commits the mini-transaction, making the
+ whole transaction committed in the file-based world, at this
+ log sequence number. The transaction becomes 'durable' when
+ we write the log to disk, but in the logical sense the commit
+ in the file-based data structures (undo logs etc.) happens
+ here.
+
+ NOTE that transaction numbers, which are assigned only to
+ transactions with an update undo log, do not necessarily come
+ in exactly the same order as commit lsn's, if the transactions
+ have different rollback segments. To get exactly the same
+ order we should hold the kernel mutex up to this point,
+ adding to the contention of the kernel mutex. However, if
+ a transaction T2 is able to see modifications made by
+ a transaction T1, T2 will always get a bigger transaction
+ number and a bigger commit lsn than T1. */
+
+ /*--------------*/
+ mtr_commit(&mtr);
+ /*--------------*/
+
+ return(mtr.end_lsn);
+}
+
+/****************************************************************//**
+Commits a transaction. */
+UNIV_INTERN
+void
+trx_commit_off_kernel(
+/*==================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ ib_uint64_t lsn;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ trx->must_flush_log_later = FALSE;
+
+ /* If the transaction made any updates then we need to write the
+ UNDO logs for the updates to the assigned rollback segment. */
+
+ if (trx->insert_undo != NULL || trx->update_undo != NULL) {
+ mutex_exit(&kernel_mutex);
+
+ lsn = trx_write_serialisation_history(trx);
mutex_enter(&kernel_mutex);
+ } else {
+ lsn = 0;
}
- ut_ad(trx->conc_state == TRX_ACTIVE
- || trx->conc_state == TRX_PREPARED);
+ ut_ad(trx->conc_state == TRX_ACTIVE || trx->conc_state == TRX_PREPARED);
ut_ad(mutex_own(&kernel_mutex));
/* The following assignment makes the transaction committed in memory
diff --git a/storage/innobase/ut/ut0bh.c b/storage/innobase/ut/ut0bh.c
new file mode 100644
index 00000000000..ae0b1aff207
--- /dev/null
+++ b/storage/innobase/ut/ut0bh.c
@@ -0,0 +1,164 @@
+/***************************************************************************//**
+Copyright (c) 2010, 2011, Oracle Corpn. All Rights Reserved.
+
+Portions of this file contain modifications contributed and copyrighted by
+Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
+are described briefly in the InnoDB documentation. The contributions by
+Sun Microsystems are incorporated with their permission, and subject to the
+conditions contained in the file COPYING.Sun_Microsystems.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file ut/ut0bh.c
+Binary min-heap implementation.
+
+Created 2010-05-28 by Sunny Bains
+*******************************************************/
+
+#include "ut0bh.h"
+#include "ut0mem.h"
+
+#ifdef UNIV_NONINL
+#include "ut0bh.ic"
+#endif
+
+#include <string.h>
+
+/**********************************************************************//**
+Create a binary heap.
+@return a new binary heap */
+UNIV_INTERN
+ib_bh_t*
+ib_bh_create(
+/*=========*/
+ ib_bh_cmp_t compare, /*!< in: comparator */
+ ulint sizeof_elem, /*!< in: size of one element */
+ ulint max_elems) /*!< in: max elements allowed */
+{
+ ulint sz;
+ ib_bh_t* ib_bh;
+
+ sz = sizeof(*ib_bh) + (sizeof_elem * max_elems);
+
+ ib_bh = (ib_bh_t*) ut_malloc(sz);
+ memset(ib_bh, 0x0, sz);
+
+ ib_bh->compare = compare;
+ ib_bh->max_elems = max_elems;
+ ib_bh->sizeof_elem = sizeof_elem;
+
+ return(ib_bh);
+}
+
+/**********************************************************************//**
+Free a binary heap.
+@return a new binary heap */
+UNIV_INTERN
+void
+ib_bh_free(
+/*=======*/
+ ib_bh_t* ib_bh) /*!< in/own: instance */
+{
+ ut_free(ib_bh);
+}
+
+/**********************************************************************//**
+Add an element to the binary heap. Note: The element is copied.
+@return pointer to added element or NULL if full. */
+UNIV_INTERN
+void*
+ib_bh_push(
+/*=======*/
+ ib_bh_t* ib_bh, /*!< in/out: instance */
+ const void* elem) /*!< in: element to add */
+{
+ void* ptr;
+
+ if (ib_bh_is_full(ib_bh)) {
+ return(NULL);
+ } else if (ib_bh_is_empty(ib_bh)) {
+ ++ib_bh->n_elems;
+ return(ib_bh_set(ib_bh, 0, elem));
+ } else {
+ ulint i;
+
+ i = ib_bh->n_elems;
+
+ ++ib_bh->n_elems;
+
+ for (ptr = ib_bh_get(ib_bh, i >> 1);
+ i > 0 && ib_bh->compare(ptr, elem) > 0;
+ i >>= 1, ptr = ib_bh_get(ib_bh, i >> 1)) {
+
+ ib_bh_set(ib_bh, i, ptr);
+ }
+
+ ptr = ib_bh_set(ib_bh, i, elem);
+ }
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Remove the first element from the binary heap. */
+UNIV_INTERN
+void
+ib_bh_pop(
+/*======*/
+ ib_bh_t* ib_bh) /*!< in/out: instance */
+{
+ byte* ptr;
+ byte* last;
+ ulint parent = 0;
+
+ if (ib_bh_is_empty(ib_bh)) {
+ return;
+ } else if (ib_bh_size(ib_bh) == 1) {
+ --ib_bh->n_elems;
+ return;
+ }
+
+ last = (byte*) ib_bh_last(ib_bh);
+
+ /* Start from the child node */
+ ptr = (byte*) ib_bh_get(ib_bh, 1);
+
+ while (ptr < last) {
+ /* If the "right" child node is < "left" child node */
+ if (ib_bh->compare(ptr + ib_bh->sizeof_elem, ptr) < 0) {
+ ptr += ib_bh->sizeof_elem;
+ }
+
+ if (ib_bh->compare(last, ptr) <= 0) {
+ break;
+ }
+
+ ib_bh_set(ib_bh, parent, ptr);
+
+ parent = (ptr - (byte*) ib_bh_first(ib_bh))
+ / ib_bh->sizeof_elem;
+
+ if ((parent << 1) >= ib_bh_size(ib_bh)) {
+ break;
+ }
+
+ ptr = (byte*) ib_bh_get(ib_bh, parent << 1);
+ }
+
+ --ib_bh->n_elems;
+
+ ib_bh_set(ib_bh, parent, last);
+}
diff --git a/storage/innobase/ut/ut0ut.c b/storage/innobase/ut/ut0ut.c
index 4f4dfc5eed8..cd0894b132a 100644
--- a/storage/innobase/ut/ut0ut.c
+++ b/storage/innobase/ut/ut0ut.c
@@ -1,13 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
-Copyright (c) 2009, Sun Microsystems, Inc.
-
-Portions of this file contain modifications contributed and copyrighted by
-Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
-are described briefly in the InnoDB documentation. The contributions by
-Sun Microsystems are incorporated with their permission, and subject to the
-conditions contained in the file COPYING.Sun_Microsystems.
+Copyright (c) 2011, Oracle Corpn. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
diff --git a/storage/myisam/mi_create.c b/storage/myisam/mi_create.c
index 46c61eb4709..ea7382300cc 100644
--- a/storage/myisam/mi_create.c
+++ b/storage/myisam/mi_create.c
@@ -269,7 +269,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
keyseg->type != HA_KEYTYPE_VARBINARY2)
{
my_errno=HA_WRONG_CREATE_OPTION;
- goto err;
+ goto err_no_lock;
}
}
keydef->keysegs+=sp_segs;
@@ -278,7 +278,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
min_key_length_skip+=SPLEN*2*SPDIMS;
#else
my_errno= HA_ERR_UNSUPPORTED;
- goto err;
+ goto err_no_lock;
#endif /*HAVE_SPATIAL*/
}
else if (keydef->flag & HA_FULLTEXT)
@@ -294,7 +294,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
keyseg->type != HA_KEYTYPE_VARTEXT2)
{
my_errno=HA_WRONG_CREATE_OPTION;
- goto err;
+ goto err_no_lock;
}
if (!(keyseg->flag & HA_BLOB_PART) &&
(keyseg->type == HA_KEYTYPE_VARTEXT1 ||
@@ -419,7 +419,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
if (keydef->keysegs > MI_MAX_KEY_SEG)
{
my_errno=HA_WRONG_CREATE_OPTION;
- goto err;
+ goto err_no_lock;
}
/*
key_segs may be 0 in the case when we only want to be able to
@@ -444,7 +444,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
length >= MI_MAX_KEY_BUFF)
{
my_errno=HA_WRONG_CREATE_OPTION;
- goto err;
+ goto err_no_lock;
}
set_if_bigger(max_key_block_length,keydef->block_length);
keydef->keylength= (uint16) key_length;
@@ -491,7 +491,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
"indexes and/or unique constraints.",
MYF(0), name + dirname_length(name));
my_errno= HA_WRONG_CREATE_OPTION;
- goto err;
+ goto err_no_lock;
}
bmove(share.state.header.file_version,(uchar*) myisam_file_magic,4);
@@ -810,12 +810,14 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
errpos=0;
mysql_mutex_unlock(&THR_LOCK_myisam);
if (mysql_file_close(file, MYF(0)))
- goto err;
+ goto err_no_lock;
my_free(rec_per_key_part);
DBUG_RETURN(0);
err:
mysql_mutex_unlock(&THR_LOCK_myisam);
+
+err_no_lock:
save_errno=my_errno;
switch (errpos) {
case 3: