summaryrefslogtreecommitdiff
path: root/storage/xtradb
diff options
context:
space:
mode:
authorMichael Widenius <monty@askmonty.org>2011-12-11 11:34:44 +0200
committerMichael Widenius <monty@askmonty.org>2011-12-11 11:34:44 +0200
commit6d4224a31c9d32c8f8067a4f7d16daa29bcdee6b (patch)
tree79e3143528495069ad232f673532573b30afe425 /storage/xtradb
parent3e2cb35e11cb5ee6668d538a62a3b32e017944a5 (diff)
parent701c0f822abe4ee9eeafd244fa30dc2fcf067b81 (diff)
downloadmariadb-git-6d4224a31c9d32c8f8067a4f7d16daa29bcdee6b.tar.gz
Merge with 5.2.
no_error handling for select (used by INSERT ... SELECT) still needs to be fixed, but I will do that in a separate commit
Diffstat (limited to 'storage/xtradb')
-rw-r--r--storage/xtradb/ChangeLog107
-rw-r--r--storage/xtradb/btr/btr0btr.c629
-rw-r--r--storage/xtradb/btr/btr0cur.c459
-rw-r--r--storage/xtradb/btr/btr0pcur.c29
-rw-r--r--storage/xtradb/btr/btr0sea.c4
-rw-r--r--storage/xtradb/buf/buf0buddy.c479
-rw-r--r--storage/xtradb/buf/buf0buf.c1054
-rw-r--r--storage/xtradb/buf/buf0flu.c11
-rw-r--r--storage/xtradb/buf/buf0lru.c350
-rw-r--r--storage/xtradb/buf/buf0rea.c179
-rw-r--r--storage/xtradb/dict/dict0dict.c7
-rw-r--r--storage/xtradb/dict/dict0load.c63
-rw-r--r--storage/xtradb/dict/dict0mem.c9
-rw-r--r--storage/xtradb/fil/fil0fil.c558
-rw-r--r--storage/xtradb/ha/hash0hash.c64
-rw-r--r--storage/xtradb/handler/ha_innodb.cc324
-rw-r--r--storage/xtradb/handler/ha_innodb.h11
-rw-r--r--storage/xtradb/handler/handler0alter.cc71
-rw-r--r--storage/xtradb/handler/i_s.cc430
-rw-r--r--storage/xtradb/handler/innodb_patch_info.h1
-rw-r--r--storage/xtradb/ibuf/ibuf0ibuf.c2
-rw-r--r--storage/xtradb/include/btr0btr.h170
-rw-r--r--storage/xtradb/include/btr0cur.h31
-rw-r--r--storage/xtradb/include/btr0cur.ic4
-rw-r--r--storage/xtradb/include/btr0pcur.h21
-rw-r--r--storage/xtradb/include/btr0pcur.ic9
-rw-r--r--storage/xtradb/include/btr0types.h125
-rw-r--r--storage/xtradb/include/buf0buddy.h21
-rw-r--r--storage/xtradb/include/buf0buddy.ic37
-rw-r--r--storage/xtradb/include/buf0buf.h81
-rw-r--r--storage/xtradb/include/buf0buf.ic74
-rw-r--r--storage/xtradb/include/buf0lru.h23
-rw-r--r--storage/xtradb/include/buf0types.h17
-rw-r--r--storage/xtradb/include/dict0mem.h7
-rw-r--r--storage/xtradb/include/fil0fil.h3
-rw-r--r--storage/xtradb/include/hash0hash.h49
-rw-r--r--storage/xtradb/include/mtr0mtr.h12
-rw-r--r--storage/xtradb/include/os0file.h3
-rw-r--r--storage/xtradb/include/os0proc.h28
-rw-r--r--storage/xtradb/include/page0cur.ic5
-rw-r--r--storage/xtradb/include/page0page.h39
-rw-r--r--storage/xtradb/include/page0page.ic32
-rw-r--r--storage/xtradb/include/page0zip.h2
-rw-r--r--storage/xtradb/include/rem0rec.h14
-rw-r--r--storage/xtradb/include/rem0rec.ic41
-rw-r--r--storage/xtradb/include/row0row.h26
-rw-r--r--storage/xtradb/include/row0row.ic45
-rw-r--r--storage/xtradb/include/row0upd.ic4
-rw-r--r--storage/xtradb/include/srv0srv.h10
-rw-r--r--storage/xtradb/include/sync0arr.h7
-rw-r--r--storage/xtradb/include/sync0rw.ic10
-rw-r--r--storage/xtradb/include/sync0sync.h6
-rw-r--r--storage/xtradb/include/trx0sys.h13
-rw-r--r--storage/xtradb/include/trx0sys.ic24
-rw-r--r--storage/xtradb/include/trx0trx.h14
-rw-r--r--storage/xtradb/include/trx0undo.h9
-rw-r--r--storage/xtradb/include/univ.i9
-rw-r--r--storage/xtradb/include/ut0lst.h43
-rw-r--r--storage/xtradb/include/ut0mem.h39
-rw-r--r--storage/xtradb/include/ut0mem.ic23
-rw-r--r--storage/xtradb/lock/lock0lock.c25
-rw-r--r--storage/xtradb/log/log0log.c17
-rw-r--r--storage/xtradb/log/log0recv.c5
-rw-r--r--storage/xtradb/mtr/mtr0mtr.c36
-rw-r--r--storage/xtradb/os/os0file.c85
-rw-r--r--storage/xtradb/os/os0proc.c170
-rw-r--r--storage/xtradb/page/page0cur.c27
-rw-r--r--storage/xtradb/page/page0page.c63
-rw-r--r--storage/xtradb/page/page0zip.c208
-rw-r--r--storage/xtradb/que/que0que.c6
-rw-r--r--storage/xtradb/rem/rem0rec.c10
-rw-r--r--storage/xtradb/row/row0ins.c90
-rw-r--r--storage/xtradb/row/row0mysql.c45
-rw-r--r--storage/xtradb/row/row0row.c109
-rw-r--r--storage/xtradb/row/row0sel.c99
-rw-r--r--storage/xtradb/row/row0upd.c86
-rw-r--r--storage/xtradb/row/row0vers.c18
-rw-r--r--storage/xtradb/srv/srv0srv.c66
-rw-r--r--storage/xtradb/srv/srv0start.c18
-rw-r--r--storage/xtradb/sync/sync0arr.c32
-rw-r--r--storage/xtradb/sync/sync0rw.c7
-rw-r--r--storage/xtradb/sync/sync0sync.c12
-rw-r--r--storage/xtradb/trx/trx0i_s.c2
-rw-r--r--storage/xtradb/trx/trx0rec.c6
-rw-r--r--storage/xtradb/trx/trx0sys.c10
-rw-r--r--storage/xtradb/trx/trx0trx.c75
-rw-r--r--storage/xtradb/trx/trx0undo.c25
-rw-r--r--storage/xtradb/ut/ut0mem.c49
88 files changed, 4380 insertions, 2992 deletions
diff --git a/storage/xtradb/ChangeLog b/storage/xtradb/ChangeLog
index 102db3d7824..7a4cacb5b43 100644
--- a/storage/xtradb/ChangeLog
+++ b/storage/xtradb/ChangeLog
@@ -1,3 +1,110 @@
+2011-10-25 The InnoDB Team
+
+ * handler/ha_innodb.cc, row/row0ins.c:
+ Fix Bug#13002783 PARTIALLY UNINITIALIZED CASCADE UPDATE VECTOR
+
+2011-08-08 The InnoDB Team
+
+ * row/row0sel.c:
+ Fix Bug#12835650 VARCHAR maximum length performance impact
+
+2011-08-08 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Fix Bug#12770537 I_S.TABLES.DATA_LENGTH DOES NOT SHOW ON-DISK SIZE
+ FOR COMPRESSED INNODB
+
+2011-07-19 The InnoDB Team
+
+ * buf/buf0buf.c, buf/buf0rea.c, handler/ha_innodb.cc,
+ include/buf0buf.h, include/buf0buf.ic, include/srv0srv.h,
+ srv/srv0srv.c:
+ Fix Bug#12356373 by reintroducing random readahead
+
+2011-06-30 The InnoDB Team
+
+ * row/row0row.c:
+ Fix Bug#12637786 Wrong secondary index entries on CHAR and VARCHAR
+ columns in ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED
+
+2011-06-16 The InnoDB Team
+
+ * btr/btr0cur.c, buf/buf0buddy.c, buf/buf0buf.c, buf/buf0lru.c,
+ include/buf0buddy.h, include/buf0buddy.ic, include/buf0buf.h,
+ include/buf0buf.ic, include/buf0lru.h, include/buf0types.h:
+ Fix Bug#61188 DROP TABLE extremely slow
+
+2011-06-16 The InnoDB Team
+
+ * buf/buf0buddy.c, buf/buf0buf.c, buf/buf0flu.c, buf/buf0lru.c,
+ include/buf0buf.h, include/buf0lru.h:
+ Fix Bug#61341 buf_LRU_insert_zip_clean can be O(N) on LRU length
+
+2011-06-16 The InnoDB Team
+
+ * page/page0zip.c, rem/rem0rec.c:
+ Fix Bug#61191 question about page_zip_available()
+
+2011-06-16 The InnoDB Team
+
+ * btr/btr0btr.c, btr/btr0cur.c, include/btr0btr.h, include/btr0cur.h,
+ include/btr0cur.ic, include/buf0buf.h, include/buf0buf.ic,
+ include/page0cur.ic, include/page0page.h, include/page0page.ic,
+ include/sync0rw.ic, include/sync0sync.h, page/page0cur.c,
+ page/page0page.c, row/row0ins.c, row/row0upd.c,
+ sync/sync0rw.c, sync/sync0sync.c:
+ Fix Bug#12612184 Race condition after btr_cur_pessimistic_update()
+
+2011-06-09 The InnoDB Team
+ * btr/btr0cur.c, include/rem0rec.h, include/rem0rec.ic,
+ * row/row0row.c, row/row0vers.c, trx/trx0rec.c:
+ Instrumentation for Bug#12612184 Race condition in row_upd_clust_rec()
+
+2011-05-19 The InnoDB Team
+
+ * row/row0row.c:
+ Fix Bug#12429576 Assertion failure on purge of column prefix index
+
+2011-04-07 The InnoDB Team
+
+ * handler/ha_innodb.cc, handler/ha_innodb.h, handler/handler0alter.cc:
+ Fix Bug #52409 Assertion failure: long semaphore wait
+
+2011-04-07 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/trx0trx.h, include/trx0undo.h,
+ log/log0log.c, trx/trx0sys.c, trx/trx0trx.c, trx/trx0undo.c:
+ Fix Bug #59641 Prepared XA transaction in system after hard crash
+ causes future shutdown hang
+
+2011-03-30 The InnoDB Team
+
+ * srv/srv0srv.c, sync/sync0arr.h, sync/sync0arr.c:
+ Fix Bug#11877216 InnoDB too eager to commit suicide on a busy server
+
+2011-03-15 The InnoDB Team
+
+ * btr/btr0cur.c, page/page0zip.c:
+ Fix Bug#11849231 inflateInit() invoked without initializing all memory
+
+2011-02-28 The InnoDB Team
+
+ * btr/btr0sea.c, buf/buf0buf.c, buf/buf0lru.c:
+ Fix Bug#58549 Race condition in buf_LRU_drop_page_hash_for_tablespace()
+ and compressed tables
+
+2011-02-15 The InnoDB Team
+
+ * sync/sync0rw.c, innodb_bug59307.test:
+ Bug#59307 Valgrind: uninitialized value in
+ rw_lock_set_writer_id_and_recursion_flag()
+
+2011-02-14 The InnoDB Team
+
+ * handler/handler0alter.cc:
+ Bug#59749 Enabling concurrent reads while creating non-primary
+ unique index gives failures
+
2011-01-31 The InnoDB Team
* btr/btr0cur.c, include/row0upd.h,
diff --git a/storage/xtradb/btr/btr0btr.c b/storage/xtradb/btr/btr0btr.c
index 2fb14b06a7b..396ad422010 100644
--- a/storage/xtradb/btr/btr0btr.c
+++ b/storage/xtradb/btr/btr0btr.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -42,6 +42,560 @@ Created 6/2/1994 Heikki Tuuri
#include "ibuf0ibuf.h"
#include "trx0trx.h"
+#ifdef UNIV_BLOB_DEBUG
+# include "srv0srv.h"
+# include "ut0rbt.h"
+
+/** TRUE when messages about index->blobs modification are enabled. */
+static ibool btr_blob_dbg_msg;
+
+/** Issue a message about an operation on index->blobs.
+@param op operation
+@param b the entry being subjected to the operation
+@param ctx the context of the operation */
+#define btr_blob_dbg_msg_issue(op, b, ctx) \
+ fprintf(stderr, op " %u:%u:%u->%u %s(%u,%u,%u)\n", \
+ (b)->ref_page_no, (b)->ref_heap_no, \
+ (b)->ref_field_no, (b)->blob_page_no, ctx, \
+ (b)->owner, (b)->always_owner, (b)->del)
+
+/** Insert to index->blobs a reference to an off-page column.
+@param index the index tree
+@param b the reference
+@param ctx context (for logging) */
+UNIV_INTERN
+void
+btr_blob_dbg_rbt_insert(
+/*====================*/
+ dict_index_t* index, /*!< in/out: index tree */
+ const btr_blob_dbg_t* b, /*!< in: the reference */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ if (btr_blob_dbg_msg) {
+ btr_blob_dbg_msg_issue("insert", b, ctx);
+ }
+ mutex_enter(&index->blobs_mutex);
+ rbt_insert(index->blobs, b, b);
+ mutex_exit(&index->blobs_mutex);
+}
+
+/** Remove from index->blobs a reference to an off-page column.
+@param index the index tree
+@param b the reference
+@param ctx context (for logging) */
+UNIV_INTERN
+void
+btr_blob_dbg_rbt_delete(
+/*====================*/
+ dict_index_t* index, /*!< in/out: index tree */
+ const btr_blob_dbg_t* b, /*!< in: the reference */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ if (btr_blob_dbg_msg) {
+ btr_blob_dbg_msg_issue("delete", b, ctx);
+ }
+ mutex_enter(&index->blobs_mutex);
+ ut_a(rbt_delete(index->blobs, b));
+ mutex_exit(&index->blobs_mutex);
+}
+
+/**************************************************************//**
+Comparator for items (btr_blob_dbg_t) in index->blobs.
+The key in index->blobs is (ref_page_no, ref_heap_no, ref_field_no).
+@return negative, 0 or positive if *a<*b, *a=*b, *a>*b */
+static
+int
+btr_blob_dbg_cmp(
+/*=============*/
+ const void* a, /*!< in: first btr_blob_dbg_t to compare */
+ const void* b) /*!< in: second btr_blob_dbg_t to compare */
+{
+ const btr_blob_dbg_t* aa = a;
+ const btr_blob_dbg_t* bb = b;
+
+ ut_ad(aa != NULL);
+ ut_ad(bb != NULL);
+
+ if (aa->ref_page_no != bb->ref_page_no) {
+ return(aa->ref_page_no < bb->ref_page_no ? -1 : 1);
+ }
+ if (aa->ref_heap_no != bb->ref_heap_no) {
+ return(aa->ref_heap_no < bb->ref_heap_no ? -1 : 1);
+ }
+ if (aa->ref_field_no != bb->ref_field_no) {
+ return(aa->ref_field_no < bb->ref_field_no ? -1 : 1);
+ }
+ return(0);
+}
+
+/**************************************************************//**
+Add a reference to an off-page column to the index->blobs map. */
+UNIV_INTERN
+void
+btr_blob_dbg_add_blob(
+/*==================*/
+ const rec_t* rec, /*!< in: clustered index record */
+ ulint field_no, /*!< in: off-page column number */
+ ulint page_no, /*!< in: start page of the column */
+ dict_index_t* index, /*!< in/out: index tree */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ btr_blob_dbg_t b;
+ const page_t* page = page_align(rec);
+
+ ut_a(index->blobs);
+
+ b.blob_page_no = page_no;
+ b.ref_page_no = page_get_page_no(page);
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ b.ref_field_no = field_no;
+ ut_a(b.ref_field_no >= index->n_uniq);
+ b.always_owner = b.owner = TRUE;
+ b.del = FALSE;
+ ut_a(!rec_get_deleted_flag(rec, page_is_comp(page)));
+ btr_blob_dbg_rbt_insert(index, &b, ctx);
+}
+
+/**************************************************************//**
+Add to index->blobs any references to off-page columns from a record.
+@return number of references added */
+UNIV_INTERN
+ulint
+btr_blob_dbg_add_rec(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: offsets */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ ulint count = 0;
+ ulint i;
+ btr_blob_dbg_t b;
+ ibool del;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (!rec_offs_any_extern(offsets)) {
+ return(0);
+ }
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ del = (rec_get_deleted_flag(rec, rec_offs_comp(offsets)) != 0);
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ const byte* field_ref = rec_get_nth_field(
+ rec, offsets, i, &len);
+
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (!memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE)) {
+ /* the column has not been stored yet */
+ continue;
+ }
+
+ b.ref_field_no = i;
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+ ut_a(b.ref_field_no >= index->n_uniq);
+ b.always_owner = b.owner
+ = !(field_ref[BTR_EXTERN_LEN]
+ & BTR_EXTERN_OWNER_FLAG);
+ b.del = del;
+
+ btr_blob_dbg_rbt_insert(index, &b, ctx);
+ count++;
+ }
+ }
+
+ return(count);
+}
+
+/**************************************************************//**
+Display the references to off-page columns.
+This function is to be called from a debugger,
+for example when a breakpoint on ut_dbg_assertion_failed is hit. */
+UNIV_INTERN
+void
+btr_blob_dbg_print(
+/*===============*/
+ const dict_index_t* index) /*!< in: index tree */
+{
+ const ib_rbt_node_t* node;
+
+ if (!index->blobs) {
+ return;
+ }
+
+ /* We intentionally do not acquire index->blobs_mutex here.
+ This function is to be called from a debugger, and the caller
+ should make sure that the index->blobs_mutex is held. */
+
+ for (node = rbt_first(index->blobs);
+ node != NULL; node = rbt_next(index->blobs, node)) {
+ const btr_blob_dbg_t* b
+ = rbt_value(btr_blob_dbg_t, node);
+ fprintf(stderr, "%u:%u:%u->%u%s%s%s\n",
+ b->ref_page_no, b->ref_heap_no, b->ref_field_no,
+ b->blob_page_no,
+ b->owner ? "" : "(disowned)",
+ b->always_owner ? "" : "(has disowned)",
+ b->del ? "(deleted)" : "");
+ }
+}
+
+/**************************************************************//**
+Remove from index->blobs any references to off-page columns from a record.
+@return number of references removed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_remove_rec(
+/*====================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: offsets */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ ulint i;
+ ulint count = 0;
+ btr_blob_dbg_t b;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (!rec_offs_any_extern(offsets)) {
+ return(0);
+ }
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ const byte* field_ref = rec_get_nth_field(
+ rec, offsets, i, &len);
+
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b.ref_field_no = i;
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+
+ switch (b.blob_page_no) {
+ case 0:
+ /* The column has not been stored yet.
+ The BLOB pointer must be all zero.
+ There cannot be a BLOB starting at
+ page 0, because page 0 is reserved for
+ the tablespace header. */
+ ut_a(!memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+ /* fall through */
+ case FIL_NULL:
+ /* the column has been freed already */
+ continue;
+ }
+
+ btr_blob_dbg_rbt_delete(index, &b, ctx);
+ count++;
+ }
+ }
+
+ return(count);
+}
+
+/**************************************************************//**
+Check that there are no references to off-page columns from or to
+the given page. Invoked when freeing or clearing a page.
+@return TRUE when no orphan references exist */
+UNIV_INTERN
+ibool
+btr_blob_dbg_is_empty(
+/*==================*/
+ dict_index_t* index, /*!< in: index */
+ ulint page_no) /*!< in: page number */
+{
+ const ib_rbt_node_t* node;
+ ibool success = TRUE;
+
+ if (!index->blobs) {
+ return(success);
+ }
+
+ mutex_enter(&index->blobs_mutex);
+
+ for (node = rbt_first(index->blobs);
+ node != NULL; node = rbt_next(index->blobs, node)) {
+ const btr_blob_dbg_t* b
+ = rbt_value(btr_blob_dbg_t, node);
+
+ if (b->ref_page_no != page_no && b->blob_page_no != page_no) {
+ continue;
+ }
+
+ fprintf(stderr,
+ "InnoDB: orphan BLOB ref%s%s%s %u:%u:%u->%u\n",
+ b->owner ? "" : "(disowned)",
+ b->always_owner ? "" : "(has disowned)",
+ b->del ? "(deleted)" : "",
+ b->ref_page_no, b->ref_heap_no, b->ref_field_no,
+ b->blob_page_no);
+
+ if (b->blob_page_no != page_no || b->owner || !b->del) {
+ success = FALSE;
+ }
+ }
+
+ mutex_exit(&index->blobs_mutex);
+ return(success);
+}
+
+/**************************************************************//**
+Count and process all references to off-page columns on a page.
+@return number of references processed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_op(
+/*============*/
+ const page_t* page, /*!< in: B-tree leaf page */
+ const rec_t* rec, /*!< in: record to start from
+ (NULL to process the whole page) */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx, /*!< in: context (for logging) */
+ const btr_blob_dbg_op_f op) /*!< in: operation on records */
+{
+ ulint count = 0;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX);
+ ut_a(!rec || page_align(rec) == page);
+
+ if (!index->blobs || !page_is_leaf(page)
+ || !dict_index_is_clust(index)) {
+ return(0);
+ }
+
+ if (rec == NULL) {
+ rec = page_get_infimum_rec(page);
+ }
+
+ do {
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ count += op(rec, index, offsets, ctx);
+ rec = page_rec_get_next_const(rec);
+ } while (!page_rec_is_supremum(rec));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(count);
+}
+
+/**************************************************************//**
+Count and add to index->blobs any references to off-page columns
+from records on a page.
+@return number of references added */
+UNIV_INTERN
+ulint
+btr_blob_dbg_add(
+/*=============*/
+ const page_t* page, /*!< in: rewritten page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ btr_blob_dbg_assert_empty(index, page_get_page_no(page));
+
+ return(btr_blob_dbg_op(page, NULL, index, ctx, btr_blob_dbg_add_rec));
+}
+
+/**************************************************************//**
+Count and remove from index->blobs any references to off-page columns
+from records on a page.
+Used when reorganizing a page, before copying the records.
+@return number of references removed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_remove(
+/*================*/
+ const page_t* page, /*!< in: b-tree page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ ulint count;
+
+ count = btr_blob_dbg_op(page, NULL, index, ctx,
+ btr_blob_dbg_remove_rec);
+
+ /* Check that no references exist. */
+ btr_blob_dbg_assert_empty(index, page_get_page_no(page));
+
+ return(count);
+}
+
+/**************************************************************//**
+Restore in index->blobs any references to off-page columns
+Used when page reorganize fails due to compressed page overflow. */
+UNIV_INTERN
+void
+btr_blob_dbg_restore(
+/*=================*/
+ const page_t* npage, /*!< in: page that failed to compress */
+ const page_t* page, /*!< in: copy of original page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ ulint removed;
+ ulint added;
+
+ ut_a(page_get_page_no(npage) == page_get_page_no(page));
+ ut_a(page_get_space_id(npage) == page_get_space_id(page));
+
+ removed = btr_blob_dbg_remove(npage, index, ctx);
+ added = btr_blob_dbg_add(page, index, ctx);
+ ut_a(added == removed);
+}
+
+/**************************************************************//**
+Modify the 'deleted' flag of a record. */
+UNIV_INTERN
+void
+btr_blob_dbg_set_deleted_flag(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: rec_get_offs(rec, index) */
+ ibool del) /*!< in: TRUE=deleted, FALSE=exists */
+{
+ const ib_rbt_node_t* node;
+ btr_blob_dbg_t b;
+ btr_blob_dbg_t* c;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_a(dict_index_is_clust(index));
+ ut_a(del == !!del);/* must be FALSE==0 or TRUE==1 */
+
+ if (!rec_offs_any_extern(offsets) || !index->blobs) {
+
+ return;
+ }
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ const byte* field_ref = rec_get_nth_field(
+ rec, offsets, i, &len);
+
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b.ref_field_no = i;
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+
+ switch (b.blob_page_no) {
+ case 0:
+ ut_a(memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+ /* page number 0 is for the
+ page allocation bitmap */
+ case FIL_NULL:
+ /* the column has been freed already */
+ ut_error;
+ }
+
+ mutex_enter(&index->blobs_mutex);
+ node = rbt_lookup(index->blobs, &b);
+ ut_a(node);
+
+ c = rbt_value(btr_blob_dbg_t, node);
+ /* The flag should be modified. */
+ c->del = del;
+ if (btr_blob_dbg_msg) {
+ b = *c;
+ mutex_exit(&index->blobs_mutex);
+ btr_blob_dbg_msg_issue("del_mk", &b, "");
+ } else {
+ mutex_exit(&index->blobs_mutex);
+ }
+ }
+ }
+}
+
+/**************************************************************//**
+Change the ownership of an off-page column. */
+UNIV_INTERN
+void
+btr_blob_dbg_owner(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: rec_get_offs(rec, index) */
+ ulint i, /*!< in: ith field in rec */
+ ibool own) /*!< in: TRUE=owned, FALSE=disowned */
+{
+ const ib_rbt_node_t* node;
+ btr_blob_dbg_t b;
+ const byte* field_ref;
+ ulint len;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_a(rec_offs_nth_extern(offsets, i));
+
+ field_ref = rec_get_nth_field(rec, offsets, i, &len);
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ b.ref_field_no = i;
+ b.owner = !(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG);
+ b.blob_page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
+
+ ut_a(b.owner == own);
+
+ mutex_enter(&index->blobs_mutex);
+ node = rbt_lookup(index->blobs, &b);
+ /* row_ins_clust_index_entry_by_modify() invokes
+ btr_cur_unmark_extern_fields() also for the newly inserted
+ references, which are all zero bytes until the columns are stored.
+ The node lookup must fail if and only if that is the case. */
+ ut_a(!memcmp(field_ref, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)
+ == !node);
+
+ if (node) {
+ btr_blob_dbg_t* c = rbt_value(btr_blob_dbg_t, node);
+ /* Some code sets ownership from TRUE to TRUE.
+ We do not allow changing ownership from FALSE to FALSE. */
+ ut_a(own || c->owner);
+
+ c->owner = own;
+ if (!own) {
+ c->always_owner = FALSE;
+ }
+ }
+
+ mutex_exit(&index->blobs_mutex);
+}
+#endif /* UNIV_BLOB_DEBUG */
+
/*
Latching strategy of the InnoDB B-tree
--------------------------------------
@@ -289,7 +843,7 @@ btr_get_next_user_rec(
/**************************************************************//**
Creates a new index page (not the root, and also not
used in page reorganization). @see btr_page_empty(). */
-static
+UNIV_INTERN
void
btr_page_create(
/*============*/
@@ -302,6 +856,7 @@ btr_page_create(
page_t* page = buf_block_get_frame(block);
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block));
if (UNIV_LIKELY_NULL(page_zip)) {
page_create_zip(block, index, level, mtr);
@@ -501,6 +1056,7 @@ btr_page_free_low(
modify clock */
buf_block_modify_clock_inc(block);
+ btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block));
if (dict_index_is_ibuf(index)) {
@@ -785,6 +1341,13 @@ btr_create(
block = buf_page_get(space, zip_size, page_no,
RW_X_LATCH, mtr);
} else {
+#ifdef UNIV_BLOB_DEBUG
+ if ((type & DICT_CLUSTERED) && !index->blobs) {
+ mutex_create(&index->blobs_mutex, SYNC_ANY_LATCH);
+ index->blobs = rbt_create(sizeof(btr_blob_dbg_t),
+ btr_blob_dbg_cmp);
+ }
+#endif /* UNIV_BLOB_DEBUG */
block = fseg_create(space, 0,
PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
}
@@ -1026,6 +1589,7 @@ btr_page_reorganize_low(
block->check_index_page_at_flush = TRUE;
#endif /* !UNIV_HOTBACKUP */
+ btr_blob_dbg_remove(page, index, "btr_page_reorganize");
/* Recreate the page: note that global data on page (possible
segment headers, next page-field, etc.) is preserved intact */
@@ -1054,6 +1618,8 @@ btr_page_reorganize_low(
(!page_zip_compress(page_zip, page, index, NULL))) {
/* Restore the old page and exit. */
+ btr_blob_dbg_restore(page, temp_page, index,
+ "btr_page_reorganize_compress_fail");
#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
/* Check that the bytes that we skip are identical. */
@@ -1168,7 +1734,7 @@ btr_parse_page_reorganize(
#ifndef UNIV_HOTBACKUP
/*************************************************************//**
Empties an index page. @see btr_page_create(). */
-static
+UNIV_INTERN
void
btr_page_empty(
/*===========*/
@@ -1187,6 +1753,7 @@ btr_page_empty(
#endif /* UNIV_ZIP_DEBUG */
btr_search_drop_page_hash_index(block);
+ btr_blob_dbg_remove(page, index, "btr_page_empty");
/* Recreate the page: note that global data on page (possible
segment headers, next page-field, etc.) is preserved intact */
@@ -1729,13 +2296,13 @@ btr_insert_on_non_leaf_level_func(
/**************************************************************//**
Attaches the halves of an index page on the appropriate level in an
index tree. */
-static
+UNIV_INTERN
void
btr_attach_half_pages(
/*==================*/
dict_index_t* index, /*!< in: the index tree */
buf_block_t* block, /*!< in/out: page to be split */
- rec_t* split_rec, /*!< in: first record on upper
+ const rec_t* split_rec, /*!< in: first record on upper
half page */
buf_block_t* new_block, /*!< in/out: the new half page */
ulint direction, /*!< in: FSP_UP or FSP_DOWN */
@@ -2427,15 +2994,16 @@ btr_node_ptr_delete(
ut_a(err == DB_SUCCESS);
if (!compressed) {
- btr_cur_compress_if_useful(&cursor, mtr);
+ btr_cur_compress_if_useful(&cursor, FALSE, mtr);
}
}
/*************************************************************//**
If page is the only on its level, this function moves its records to the
-father page, thus reducing the tree height. */
+father page, thus reducing the tree height.
+@return father block */
static
-void
+buf_block_t*
btr_lift_page_up(
/*=============*/
dict_index_t* index, /*!< in: index tree */
@@ -2527,6 +3095,7 @@ btr_lift_page_up(
index);
}
+ btr_blob_dbg_remove(page, index, "btr_lift_page_up");
lock_update_copy_and_discard(father_block, block);
/* Go upward to root page, decrementing levels by one. */
@@ -2551,6 +3120,8 @@ btr_lift_page_up(
}
ut_ad(page_validate(father_page, index));
ut_ad(btr_check_node_ptr(index, father_block, mtr));
+
+ return(father_block);
}
/*************************************************************//**
@@ -2567,11 +3138,13 @@ UNIV_INTERN
ibool
btr_compress(
/*=========*/
- btr_cur_t* cursor, /*!< in: cursor on the page to merge or lift;
- the page must not be empty: in record delete
- use btr_discard_page if the page would become
- empty */
- mtr_t* mtr) /*!< in: mtr */
+ btr_cur_t* cursor, /*!< in/out: cursor on the page to merge
+ or lift; the page must not be empty:
+ when deleting records, use btr_discard_page()
+ if the page would become empty */
+ ibool adjust, /*!< in: TRUE if should adjust the
+ cursor position even if compression occurs */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
{
dict_index_t* index;
ulint space;
@@ -2589,12 +3162,14 @@ btr_compress(
ulint* offsets;
ulint data_size;
ulint n_recs;
+ ulint nth_rec = 0; /* remove bogus warning */
ulint max_ins_size;
ulint max_ins_size_reorg;
block = btr_cur_get_block(cursor);
page = btr_cur_get_page(cursor);
index = btr_cur_get_index(cursor);
+
ut_a((ibool) !!page_is_comp(page) == dict_table_is_comp(index->table));
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
@@ -2615,6 +3190,10 @@ btr_compress(
offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
&father_cursor);
+ if (adjust) {
+ nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor));
+ }
+
/* Decide the page to which we try to merge and which will inherit
the locks */
@@ -2641,9 +3220,9 @@ btr_compress(
} else {
/* The page is the only one on the level, lift the records
to the father */
- btr_lift_page_up(index, block, mtr);
- mem_heap_free(heap);
- return(TRUE);
+
+ merge_block = btr_lift_page_up(index, block, mtr);
+ goto func_exit;
}
n_recs = page_get_n_recs(page);
@@ -2725,6 +3304,10 @@ err_exit:
btr_node_ptr_delete(index, block, mtr);
lock_update_merge_left(merge_block, orig_pred, block);
+
+ if (adjust) {
+ nth_rec += page_rec_get_n_recs_before(orig_pred);
+ }
} else {
rec_t* orig_succ;
#ifdef UNIV_BTR_DEBUG
@@ -2788,7 +3371,7 @@ err_exit:
lock_update_merge_right(merge_block, orig_succ, block);
}
- mem_heap_free(heap);
+ btr_blob_dbg_remove(page, index, "btr_compress");
if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) {
/* Update the free bits of the B-tree page in the
@@ -2840,6 +3423,16 @@ err_exit:
btr_page_free(index, block, mtr);
ut_ad(btr_check_node_ptr(index, merge_block, mtr));
+func_exit:
+ mem_heap_free(heap);
+
+ if (adjust) {
+ btr_cur_position(
+ index,
+ page_rec_get_nth(merge_block->frame, nth_rec),
+ merge_block, cursor);
+ }
+
return(TRUE);
}
@@ -3018,6 +3611,8 @@ btr_discard_page(
block);
}
+ btr_blob_dbg_remove(page, index, "btr_discard_page");
+
/* Free the file page */
btr_page_free(index, block, mtr);
diff --git a/storage/xtradb/btr/btr0cur.c b/storage/xtradb/btr/btr0cur.c
index d16522731c4..51a2f784fbf 100644
--- a/storage/xtradb/btr/btr0cur.c
+++ b/storage/xtradb/btr/btr0cur.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -1046,6 +1046,11 @@ btr_cur_ins_lock_and_undo(
rec_t* rec;
roll_ptr_t roll_ptr;
+ if (thr && thr_get_trx(thr)->fake_changes) {
+ /* skip LOCK, UNDO */
+ return(DB_SUCCESS);
+ }
+
/* Check if we have to wait for a lock: enqueue an explicit lock
request if yes */
@@ -1177,7 +1182,7 @@ btr_cur_optimistic_insert(
}
#endif /* UNIV_DEBUG */
- ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
max_size = page_get_max_insert_size_after_reorganize(page, 1);
leaf = page_is_leaf(page);
@@ -1272,6 +1277,12 @@ fail_err:
goto fail_err;
}
+ if (thr && thr_get_trx(thr)->fake_changes) {
+ /* skip CHANGE, LOG */
+ *big_rec = big_rec_vec;
+ return(err); /* == DB_SUCCESS */
+ }
+
page_cursor = btr_cur_get_page_cur(cursor);
/* Now, try the insert */
@@ -1414,10 +1425,10 @@ btr_cur_pessimistic_insert(
*big_rec = NULL;
- ut_ad(mtr_memo_contains(mtr,
+ ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr,
dict_index_get_lock(btr_cur_get_index(cursor)),
MTR_MEMO_X_LOCK));
- ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+ ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, btr_cur_get_block(cursor),
MTR_MEMO_PAGE_X_FIX));
/* Try first an optimistic insert; reset the cursor flag: we do not
@@ -1483,6 +1494,16 @@ btr_cur_pessimistic_insert(
}
}
+ if (thr && thr_get_trx(thr)->fake_changes) {
+ /* skip CHANGE, LOG */
+ if (n_extents > 0) {
+ fil_space_release_free_extents(index->space,
+ n_reserved);
+ }
+ *big_rec = big_rec_vec;
+ return(DB_SUCCESS);
+ }
+
if (dict_index_get_page(index)
== buf_block_get_page_no(btr_cur_get_block(cursor))) {
@@ -1539,6 +1560,11 @@ btr_cur_upd_lock_and_undo(
ut_ad(cursor && update && thr && roll_ptr);
+ if (thr && thr_get_trx(thr)->fake_changes) {
+ /* skip LOCK, UNDO */
+ return(DB_SUCCESS);
+ }
+
rec = btr_cur_get_rec(cursor);
index = cursor->index;
@@ -1837,6 +1863,14 @@ btr_cur_update_in_place(
return(err);
}
+ if (trx->fake_changes) {
+ /* skip CHANGE, LOG */
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err); /* == DB_SUCCESS */
+ }
+
if (block->is_hashed) {
/* The function row_upd_changes_ord_field_binary works only
if the update vector was built for a clustered index, we must
@@ -1929,7 +1963,6 @@ btr_cur_optimistic_update(
ulint old_rec_size;
dtuple_t* new_entry;
roll_ptr_t roll_ptr;
- trx_t* trx;
mem_heap_t* heap;
ulint i;
ulint n_ext;
@@ -1940,12 +1973,16 @@ btr_cur_optimistic_update(
rec = btr_cur_get_rec(cursor);
index = cursor->index;
ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
- ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
/* The insert buffer tree should never be updated in place. */
ut_ad(!dict_index_is_ibuf(index));
heap = mem_heap_create(1024);
offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(rec, offsets)
+ || trx_is_recv(thr_get_trx(thr)));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
#ifdef UNIV_DEBUG
if (btr_cur_print_record_ops && thr) {
@@ -2050,6 +2087,11 @@ any_extern:
goto err_exit;
}
+ if (thr && thr_get_trx(thr)->fake_changes) {
+ /* skip CHANGE, LOG */
+ goto err_exit; /* == DB_SUCCESS */
+ }
+
/* Ok, we may do the replacement. Store on the page infimum the
explicit locks on rec, before deleting rec (see the comment in
btr_cur_pessimistic_update). */
@@ -2068,13 +2110,11 @@ any_extern:
page_cur_move_to_prev(page_cursor);
- trx = thr_get_trx(thr);
-
if (!(flags & BTR_KEEP_SYS_FLAG)) {
row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
roll_ptr);
row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
- trx->id);
+ thr_get_trx(thr)->id);
}
/* There are no externally stored columns in new_entry */
@@ -2160,7 +2200,9 @@ btr_cur_pessimistic_update(
/*=======================*/
ulint flags, /*!< in: undo logging, locking, and rollback
flags */
- btr_cur_t* cursor, /*!< in: cursor on the record to update */
+ btr_cur_t* cursor, /*!< in/out: cursor on the record to update;
+ cursor may become invalid if *big_rec == NULL
+ || !(flags & BTR_KEEP_POS_FLAG) */
mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
be stored externally by the caller, or NULL */
@@ -2200,9 +2242,9 @@ btr_cur_pessimistic_update(
rec = btr_cur_get_rec(cursor);
index = cursor->index;
- ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, dict_index_get_lock(index),
MTR_MEMO_X_LOCK));
- ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
#ifdef UNIV_ZIP_DEBUG
ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
@@ -2290,6 +2332,9 @@ btr_cur_pessimistic_update(
ut_ad(big_rec_vec == NULL);
+ /* fake_changes should not cause undo. so never reaches here */
+ ut_ad(!(trx->fake_changes));
+
btr_rec_free_updated_extern_fields(
index, rec, page_zip, offsets, update,
trx_is_recv(trx) ? RB_RECOVERY : RB_NORMAL, mtr);
@@ -2299,7 +2344,7 @@ btr_cur_pessimistic_update(
record to be inserted: we have to remember which fields were such */
ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
- offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, heap);
+ ut_ad(rec_offs_validate(rec, index, offsets));
n_ext += btr_push_update_extern_fields(new_entry, update, *heap);
if (UNIV_LIKELY_NULL(page_zip)) {
@@ -2322,6 +2367,16 @@ make_external:
err = DB_TOO_BIG_RECORD;
goto return_after_reservations;
}
+
+ ut_ad(page_is_leaf(page));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(flags & BTR_KEEP_POS_FLAG);
+ }
+
+ if (trx->fake_changes) {
+ /* skip CHANGE, LOG */
+ err = DB_SUCCESS;
+ goto return_after_reservations;
}
/* Store state of explicit locks on rec on the page infimum record,
@@ -2349,6 +2404,8 @@ make_external:
rec = btr_cur_insert_if_possible(cursor, new_entry, n_ext, mtr);
if (rec) {
+ page_cursor->rec = rec;
+
lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
rec, block);
@@ -2362,7 +2419,10 @@ make_external:
rec, index, offsets, mtr);
}
- btr_cur_compress_if_useful(cursor, mtr);
+ btr_cur_compress_if_useful(
+ cursor,
+ big_rec_vec != NULL && (flags & BTR_KEEP_POS_FLAG),
+ mtr);
if (page_zip && !dict_index_is_clust(index)
&& page_is_leaf(page)) {
@@ -2382,6 +2442,21 @@ make_external:
}
}
+ if (big_rec_vec) {
+ ut_ad(page_is_leaf(page));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(flags & BTR_KEEP_POS_FLAG);
+
+ /* btr_page_split_and_insert() in
+ btr_cur_pessimistic_insert() invokes
+ mtr_memo_release(mtr, index->lock, MTR_MEMO_X_LOCK).
+ We must keep the index->lock when we created a
+ big_rec, so that row_upd_clust_rec() can store the
+ big_rec in the same mini-transaction. */
+
+ mtr_x_lock(dict_index_get_lock(index), mtr);
+ }
+
/* Was the record to be updated positioned as the first user
record on its page? */
was_first = page_cur_is_before_first(page_cursor);
@@ -2397,6 +2472,7 @@ make_external:
ut_a(rec);
ut_a(err == DB_SUCCESS);
ut_a(dummy_big_rec == NULL);
+ page_cursor->rec = rec;
if (dict_index_is_sec_or_ibuf(index)) {
/* Update PAGE_MAX_TRX_ID in the index page header.
@@ -2455,6 +2531,39 @@ return_after_reservations:
return(err);
}
+/**************************************************************//**
+Commits and restarts a mini-transaction so that it will retain an
+x-lock on index->lock and the cursor page. */
+UNIV_INTERN
+void
+btr_cur_mtr_commit_and_start(
+/*=========================*/
+ btr_cur_t* cursor, /*!< in: cursor */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ buf_block_t* block;
+
+ block = btr_cur_get_block(cursor);
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ /* Keep the locks across the mtr_commit(mtr). */
+ rw_lock_x_lock(dict_index_get_lock(cursor->index));
+ rw_lock_x_lock(&block->lock);
+ mutex_enter(&block->mutex);
+ buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+ mutex_exit(&block->mutex);
+ /* Write out the redo log. */
+ mtr_commit(mtr);
+ mtr_start(mtr);
+ /* Reassociate the locks with the mini-transaction.
+ They will be released on mtr_commit(mtr). */
+ mtr_memo_push(mtr, dict_index_get_lock(cursor->index),
+ MTR_MEMO_X_LOCK);
+ mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
+}
+
/*==================== B-TREE DELETE MARK AND UNMARK ===============*/
/****************************************************************//**
@@ -2625,6 +2734,11 @@ btr_cur_del_mark_set_clust_rec(
ut_ad(dict_index_is_clust(index));
ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
+ if (thr && thr_get_trx(thr)->fake_changes) {
+ /* skip LOCK, UNDO, CHANGE, LOG */
+ return(DB_SUCCESS);
+ }
+
err = lock_clust_rec_modify_check_and_lock(flags, block,
rec, index, offsets, thr);
@@ -2647,6 +2761,7 @@ btr_cur_del_mark_set_clust_rec(
page_zip = buf_block_get_page_zip(block);
+ btr_blob_dbg_set_deleted_flag(rec, index, offsets, val);
btr_rec_set_deleted_flag(rec, page_zip, val);
trx = thr_get_trx(thr);
@@ -2761,6 +2876,11 @@ btr_cur_del_mark_set_sec_rec(
rec_t* rec;
ulint err;
+ if (thr && thr_get_trx(thr)->fake_changes) {
+ /* skip LOCK, CHANGE, LOG */
+ return(DB_SUCCESS);
+ }
+
block = btr_cur_get_block(cursor);
rec = btr_cur_get_rec(cursor);
@@ -2833,10 +2953,12 @@ UNIV_INTERN
ibool
btr_cur_compress_if_useful(
/*=======================*/
- btr_cur_t* cursor, /*!< in: cursor on the page to compress;
- cursor does not stay valid if compression
- occurs */
- mtr_t* mtr) /*!< in: mtr */
+ btr_cur_t* cursor, /*!< in/out: cursor on the page to compress;
+ cursor does not stay valid if !adjust and
+ compression occurs */
+ ibool adjust, /*!< in: TRUE if should adjust the
+ cursor position even if compression occurs */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
{
ut_ad(mtr_memo_contains(mtr,
dict_index_get_lock(btr_cur_get_index(cursor)),
@@ -2845,7 +2967,7 @@ btr_cur_compress_if_useful(
MTR_MEMO_PAGE_X_FIX));
return(btr_cur_compress_recommendation(cursor, mtr)
- && btr_compress(cursor, mtr));
+ && btr_compress(cursor, adjust, mtr));
}
/*******************************************************//**
@@ -3092,7 +3214,7 @@ return_after_reservations:
mem_heap_free(heap);
if (ret == FALSE) {
- ret = btr_cur_compress_if_useful(cursor, mtr);
+ ret = btr_cur_compress_if_useful(cursor, FALSE, mtr);
}
if (n_extents > 0) {
@@ -3116,6 +3238,7 @@ btr_cur_add_path_info(
{
btr_path_t* slot;
rec_t* rec;
+ page_t* page;
ut_a(cursor->path_arr);
@@ -3138,8 +3261,155 @@ btr_cur_add_path_info(
slot = cursor->path_arr + (root_height - height);
+ page = page_align(rec);
+
slot->nth_rec = page_rec_get_n_recs_before(rec);
- slot->n_recs = page_get_n_recs(page_align(rec));
+ slot->n_recs = page_get_n_recs(page);
+ slot->page_no = page_get_page_no(page);
+ slot->page_level = btr_page_get_level_low(page);
+}
+
+/*******************************************************************//**
+Estimate the number of rows between slot1 and slot2 for any level on a
+B-tree. This function starts from slot1->page and reads a few pages to
+the right, counting their records. If we reach slot2->page quickly then
+we know exactly how many records there are between slot1 and slot2 and
+we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly
+then we calculate the average number of records in the pages scanned
+so far and assume that all pages that we did not scan up to slot2->page
+contain the same number of records, then we multiply that average to
+the number of pages between slot1->page and slot2->page (which is
+n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
+@return number of rows (exact or estimated) */
+static
+ib_int64_t
+btr_estimate_n_rows_in_range_on_level(
+/*==================================*/
+ dict_index_t* index, /*!< in: index */
+ btr_path_t* slot1, /*!< in: left border */
+ btr_path_t* slot2, /*!< in: right border */
+ ib_int64_t n_rows_on_prev_level, /*!< in: number of rows
+ on the previous level for the
+ same descend paths; used to
+ determine the numbe of pages
+ on this level */
+ ibool* is_n_rows_exact) /*!< out: TRUE if the returned
+ value is exact i.e. not an
+ estimation */
+{
+ ulint space;
+ ib_int64_t n_rows;
+ ulint n_pages_read;
+ ulint page_no;
+ ulint zip_size;
+ ulint level;
+
+ space = dict_index_get_space(index);
+
+ n_rows = 0;
+ n_pages_read = 0;
+
+ /* Assume by default that we will scan all pages between
+ slot1->page_no and slot2->page_no */
+ *is_n_rows_exact = TRUE;
+
+ /* add records from slot1->page_no which are to the right of
+ the record which serves as a left border of the range, if any */
+ if (slot1->nth_rec < slot1->n_recs) {
+ n_rows += slot1->n_recs - slot1->nth_rec;
+ }
+
+ /* add records from slot2->page_no which are to the left of
+ the record which servers as a right border of the range, if any */
+ if (slot2->nth_rec > 1) {
+ n_rows += slot2->nth_rec - 1;
+ }
+
+ /* count the records in the pages between slot1->page_no and
+ slot2->page_no (non inclusive), if any */
+
+ zip_size = fil_space_get_zip_size(space);
+
+ /* Do not read more than this number of pages in order not to hurt
+ performance with this code which is just an estimation. If we read
+ this many pages before reaching slot2->page_no then we estimate the
+ average from the pages scanned so far */
+ #define N_PAGES_READ_LIMIT 10
+
+ page_no = slot1->page_no;
+ level = slot1->page_level;
+
+ do {
+ mtr_t mtr;
+ page_t* page;
+ buf_block_t* block;
+
+ mtr_start(&mtr);
+
+ /* fetch the page */
+ block = buf_page_get(space, zip_size, page_no, RW_S_LATCH,
+ &mtr);
+
+ page = buf_block_get_frame(block);
+
+ /* It is possible that the tree has been reorganized in the
+ meantime and this is a different page. If this happens the
+ calculated estimate will be bogus, which is not fatal as
+ this is only an estimate. We are sure that a page with
+ page_no exists because InnoDB never frees pages, only
+ reuses them. */
+ if (fil_page_get_type(page) != FIL_PAGE_INDEX
+ || ut_dulint_cmp(btr_page_get_index_id(page), index->id)
+ || btr_page_get_level_low(page) != level) {
+
+ /* The page got reused for something else */
+ goto inexact;
+ }
+
+ n_pages_read++;
+
+ if (page_no != slot1->page_no) {
+ /* Do not count the records on slot1->page_no,
+ we already counted them before this loop. */
+ n_rows += page_get_n_recs(page);
+ }
+
+ page_no = btr_page_get_next(page, &mtr);
+
+ mtr_commit(&mtr);
+
+ if (n_pages_read == N_PAGES_READ_LIMIT
+ || page_no == FIL_NULL) {
+ /* Either we read too many pages or
+ we reached the end of the level without passing
+ through slot2->page_no, the tree must have changed
+ in the meantime */
+ goto inexact;
+ }
+
+ } while (page_no != slot2->page_no);
+
+ return(n_rows);
+
+inexact:
+
+ *is_n_rows_exact = FALSE;
+
+ /* We did interrupt before reaching slot2->page */
+
+ if (n_pages_read > 0) {
+ /* The number of pages on this level is
+ n_rows_on_prev_level, multiply it by the
+ average number of recs per page so far */
+ n_rows = n_rows_on_prev_level
+ * n_rows / n_pages_read;
+ } else {
+ /* The tree changed before we could even
+ start with slot1->page_no */
+ n_rows = 10;
+ }
+
+ return(n_rows);
}
/*******************************************************************//**
@@ -3164,6 +3434,7 @@ btr_estimate_n_rows_in_range(
ibool diverged_lot;
ulint divergence_level;
ib_int64_t n_rows;
+ ibool is_n_rows_exact;
ulint i;
mtr_t mtr;
@@ -3206,6 +3477,7 @@ btr_estimate_n_rows_in_range(
/* We have the path information for the range in path1 and path2 */
n_rows = 1;
+ is_n_rows_exact = TRUE;
diverged = FALSE; /* This becomes true when the path is not
the same any more */
diverged_lot = FALSE; /* This becomes true when the paths are
@@ -3221,7 +3493,7 @@ btr_estimate_n_rows_in_range(
if (slot1->nth_rec == ULINT_UNDEFINED
|| slot2->nth_rec == ULINT_UNDEFINED) {
- if (i > divergence_level + 1) {
+ if (i > divergence_level + 1 && !is_n_rows_exact) {
/* In trees whose height is > 1 our algorithm
tends to underestimate: multiply the estimate
by 2: */
@@ -3233,7 +3505,9 @@ btr_estimate_n_rows_in_range(
to over 1 / 2 of the estimated rows in the whole
table */
- if (n_rows > index->table->stat_n_rows / 2) {
+ if (n_rows > index->table->stat_n_rows / 2
+ && !is_n_rows_exact) {
+
n_rows = index->table->stat_n_rows / 2;
/* If there are just 0 or 1 rows in the table,
@@ -3259,10 +3533,15 @@ btr_estimate_n_rows_in_range(
divergence_level = i;
}
} else {
- /* Maybe the tree has changed between
- searches */
-
- return(10);
+ /* It is possible that
+ slot1->nth_rec >= slot2->nth_rec
+ if, for example, we have a single page
+ tree which contains (inf, 5, 6, supr)
+ and we select where x > 20 and x < 30;
+ in this case slot1->nth_rec will point
+ to the supr record and slot2->nth_rec
+ will point to 6 */
+ n_rows = 0;
}
} else if (diverged && !diverged_lot) {
@@ -3286,8 +3565,9 @@ btr_estimate_n_rows_in_range(
}
} else if (diverged_lot) {
- n_rows = (n_rows * (slot1->n_recs + slot2->n_recs))
- / 2;
+ n_rows = btr_estimate_n_rows_in_range_on_level(
+ index, slot1, slot2, n_rows,
+ &is_n_rows_exact);
}
}
}
@@ -3679,6 +3959,8 @@ btr_cur_set_ownership_of_extern_field(
} else {
mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
}
+
+ btr_blob_dbg_owner(rec, index, offsets, i, val);
}
/*******************************************************************//**
@@ -3883,7 +4165,7 @@ btr_blob_free(
&& buf_block_get_space(block) == space
&& buf_block_get_page_no(block) == page_no) {
- if (buf_LRU_free_block(&block->page, all, TRUE) != BUF_LRU_FREED
+ if (!buf_LRU_free_block(&block->page, all, TRUE)
&& all && block->page.zip.data
/* Now, buf_LRU_free_block() may release mutex temporarily */
&& buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE
@@ -4184,6 +4466,11 @@ btr_store_big_rec_extern_fields_func(
}
if (prev_page_no == FIL_NULL) {
+ btr_blob_dbg_add_blob(
+ rec, big_rec_vec->fields[i]
+ .field_no, page_no, index,
+ "store");
+
mach_write_to_4(field_ref
+ BTR_EXTERN_SPACE_ID,
space_id);
@@ -4259,6 +4546,11 @@ next_zip_page:
MLOG_4BYTES, &mtr);
if (prev_page_no == FIL_NULL) {
+ btr_blob_dbg_add_blob(
+ rec, big_rec_vec->fields[i]
+ .field_no, page_no, index,
+ "store");
+
mlog_write_ulint(field_ref
+ BTR_EXTERN_SPACE_ID,
space_id,
@@ -4427,6 +4719,37 @@ btr_free_externally_stored_field(
rec_zip_size = 0;
}
+#ifdef UNIV_BLOB_DEBUG
+ if (!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)
+ && !((field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
+ && (rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY))) {
+ /* This off-page column will be freed.
+ Check that no references remain. */
+
+ btr_blob_dbg_t b;
+
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+
+ if (rec) {
+ /* Remove the reference from the record to the
+ BLOB. If the BLOB were not freed, the
+ reference would be removed when the record is
+ removed. Freeing the BLOB will overwrite the
+ BTR_EXTERN_PAGE_NO in the field_ref of the
+ record with FIL_NULL, which would make the
+ btr_blob_dbg information inconsistent with the
+ record. */
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ b.ref_field_no = i;
+ btr_blob_dbg_rbt_delete(index, &b, "free");
+ }
+
+ btr_blob_dbg_assert_empty(index, b.blob_page_no);
+ }
+#endif /* UNIV_BLOB_DEBUG */
+
for (;;) {
#ifdef UNIV_SYNC_DEBUG
buf_block_t* rec_block;
@@ -4673,27 +4996,45 @@ btr_copy_blob_prefix(
/*******************************************************************//**
Copies the prefix of a compressed BLOB. The clustered index record
-that points to this BLOB must be protected by a lock or a page latch. */
+that points to this BLOB must be protected by a lock or a page latch.
+@return number of bytes written to buf */
static
-void
+ulint
btr_copy_zblob_prefix(
/*==================*/
- z_stream* d_stream,/*!< in/out: the decompressing stream */
+ byte* buf, /*!< out: the externally stored part of
+ the field, or a prefix of it */
+ ulint len, /*!< in: length of buf, in bytes */
ulint zip_size,/*!< in: compressed BLOB page size */
ulint space_id,/*!< in: space id of the BLOB pages */
ulint page_no,/*!< in: page number of the first BLOB page */
ulint offset) /*!< in: offset on the first BLOB page */
{
- ulint page_type = FIL_PAGE_TYPE_ZBLOB;
+ ulint page_type = FIL_PAGE_TYPE_ZBLOB;
+ mem_heap_t* heap;
+ int err;
+ z_stream d_stream;
+
+ d_stream.next_out = buf;
+ d_stream.avail_out = len;
+ d_stream.next_in = Z_NULL;
+ d_stream.avail_in = 0;
+
+ /* Zlib inflate needs 32 kilobytes for the default
+ window size, plus a few kilobytes for small objects. */
+ heap = mem_heap_create(40000);
+ page_zip_set_alloc(&d_stream, heap);
ut_ad(ut_is_2pow(zip_size));
ut_ad(zip_size >= PAGE_ZIP_MIN_SIZE);
ut_ad(zip_size <= UNIV_PAGE_SIZE);
ut_ad(space_id);
+ err = inflateInit(&d_stream);
+ ut_a(err == Z_OK);
+
for (;;) {
buf_page_t* bpage;
- int err;
ulint next_page_no;
/* There is no latch on bpage directly. Instead,
@@ -4709,7 +5050,7 @@ btr_copy_zblob_prefix(
" compressed BLOB"
" page %lu space %lu\n",
(ulong) page_no, (ulong) space_id);
- return;
+ goto func_exit;
}
if (UNIV_UNLIKELY
@@ -4735,13 +5076,13 @@ btr_copy_zblob_prefix(
offset += 4;
}
- d_stream->next_in = bpage->zip.data + offset;
- d_stream->avail_in = zip_size - offset;
+ d_stream.next_in = bpage->zip.data + offset;
+ d_stream.avail_in = zip_size - offset;
- err = inflate(d_stream, Z_NO_FLUSH);
+ err = inflate(&d_stream, Z_NO_FLUSH);
switch (err) {
case Z_OK:
- if (!d_stream->avail_out) {
+ if (!d_stream.avail_out) {
goto end_of_blob;
}
break;
@@ -4758,13 +5099,13 @@ inflate_error:
" compressed BLOB"
" page %lu space %lu returned %d (%s)\n",
(ulong) page_no, (ulong) space_id,
- err, d_stream->msg);
+ err, d_stream.msg);
case Z_BUF_ERROR:
goto end_of_blob;
}
if (next_page_no == FIL_NULL) {
- if (!d_stream->avail_in) {
+ if (!d_stream.avail_in) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: unexpected end of"
@@ -4773,7 +5114,7 @@ inflate_error:
(ulong) page_no,
(ulong) space_id);
} else {
- err = inflate(d_stream, Z_FINISH);
+ err = inflate(&d_stream, Z_FINISH);
switch (err) {
case Z_STREAM_END:
case Z_BUF_ERROR:
@@ -4785,7 +5126,7 @@ inflate_error:
end_of_blob:
buf_page_release_zip(bpage);
- return;
+ goto func_exit;
}
buf_page_release_zip(bpage);
@@ -4797,6 +5138,12 @@ end_of_blob:
offset = FIL_PAGE_NEXT;
page_type = FIL_PAGE_TYPE_ZBLOB2;
}
+
+func_exit:
+ inflateEnd(&d_stream);
+ mem_heap_free(heap);
+ UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
+ return(d_stream.total_out);
}
/*******************************************************************//**
@@ -4822,28 +5169,8 @@ btr_copy_externally_stored_field_prefix_low(
}
if (UNIV_UNLIKELY(zip_size)) {
- int err;
- z_stream d_stream;
- mem_heap_t* heap;
-
- /* Zlib inflate needs 32 kilobytes for the default
- window size, plus a few kilobytes for small objects. */
- heap = mem_heap_create(40000);
- page_zip_set_alloc(&d_stream, heap);
-
- err = inflateInit(&d_stream);
- ut_a(err == Z_OK);
-
- d_stream.next_out = buf;
- d_stream.avail_out = len;
- d_stream.avail_in = 0;
-
- btr_copy_zblob_prefix(&d_stream, zip_size,
- space_id, page_no, offset);
- inflateEnd(&d_stream);
- mem_heap_free(heap);
- UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
- return(d_stream.total_out);
+ return(btr_copy_zblob_prefix(buf, len, zip_size,
+ space_id, page_no, offset));
} else {
return(btr_copy_blob_prefix(buf, len, space_id,
page_no, offset));
diff --git a/storage/xtradb/btr/btr0pcur.c b/storage/xtradb/btr/btr0pcur.c
index f95a5487c94..97fe06f0f5e 100644
--- a/storage/xtradb/btr/btr0pcur.c
+++ b/storage/xtradb/btr/btr0pcur.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -362,33 +362,6 @@ btr_pcur_restore_position_func(
return(FALSE);
}
-/**************************************************************//**
-If the latch mode of the cursor is BTR_LEAF_SEARCH or BTR_LEAF_MODIFY,
-releases the page latch and bufferfix reserved by the cursor.
-NOTE! In the case of BTR_LEAF_MODIFY, there should not exist changes
-made by the current mini-transaction to the data protected by the
-cursor latch, as then the latch must not be released until mtr_commit. */
-UNIV_INTERN
-void
-btr_pcur_release_leaf(
-/*==================*/
- btr_pcur_t* cursor, /*!< in: persistent cursor */
- mtr_t* mtr) /*!< in: mtr */
-{
- buf_block_t* block;
-
- ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
- ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
-
- block = btr_pcur_get_block(cursor);
-
- btr_leaf_page_release(block, cursor->latch_mode, mtr);
-
- cursor->latch_mode = BTR_NO_LATCHES;
-
- cursor->pos_state = BTR_PCUR_WAS_POSITIONED;
-}
-
/*********************************************************//**
Moves the persistent cursor to the first record on the next page. Releases the
latch on the current page, and bufferunfixes it. Note that there must not be
diff --git a/storage/xtradb/btr/btr0sea.c b/storage/xtradb/btr/btr0sea.c
index 3b38e2799c2..6e6c533f4af 100644
--- a/storage/xtradb/btr/btr0sea.c
+++ b/storage/xtradb/btr/btr0sea.c
@@ -1373,8 +1373,8 @@ btr_search_drop_page_hash_when_freed(
having to fear a deadlock. */
block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH, NULL,
- BUF_GET_IF_IN_POOL, __FILE__, __LINE__,
- &mtr);
+ BUF_PEEK_IF_IN_POOL, __FILE__, __LINE__,
+ &mtr);
/* Because the buffer pool mutex was released by
buf_page_peek_if_search_hashed(), it is possible that the
block was removed from the buffer pool by another thread
diff --git a/storage/xtradb/buf/buf0buddy.c b/storage/xtradb/buf/buf0buddy.c
index db94b4bed24..673d6c55efc 100644
--- a/storage/xtradb/buf/buf0buddy.c
+++ b/storage/xtradb/buf/buf0buddy.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2006, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -45,6 +45,14 @@ static ulint buf_buddy_n_frames;
Protected by buf_pool_mutex. */
UNIV_INTERN buf_buddy_stat_t buf_buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
+/** Validate a given zip_free list. */
+#define BUF_BUDDY_LIST_VALIDATE(i) \
+ UT_LIST_VALIDATE(zip_list, buf_page_t, \
+ buf_pool->zip_free[i], \
+ ut_ad(buf_page_get_state( \
+ ut_list_node_313) \
+ == BUF_BLOCK_ZIP_FREE))
+
/**********************************************************************//**
Get the offset of the buddy of a compressed page frame.
@return the buddy relative of page */
@@ -76,22 +84,11 @@ buf_buddy_add_to_free(
buf_page_t* bpage, /*!< in,own: block to be freed */
ulint i) /*!< in: index of buf_pool->zip_free[] */
{
-#ifdef UNIV_DEBUG_VALGRIND
- buf_page_t* b = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
-
- if (b) UNIV_MEM_VALID(b, BUF_BUDDY_LOW << i);
-#endif /* UNIV_DEBUG_VALGRIND */
-
//ut_ad(buf_pool_mutex_own());
ut_ad(mutex_own(&zip_free_mutex));
ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
ut_ad(buf_pool->zip_free[i].start != bpage);
UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_free[i], bpage);
-
-#ifdef UNIV_DEBUG_VALGRIND
- if (b) UNIV_MEM_FREE(b, BUF_BUDDY_LOW << i);
- UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i);
-#endif /* UNIV_DEBUG_VALGRIND */
}
/**********************************************************************//**
@@ -103,26 +100,18 @@ buf_buddy_remove_from_free(
buf_page_t* bpage, /*!< in: block to be removed */
ulint i) /*!< in: index of buf_pool->zip_free[] */
{
-#ifdef UNIV_DEBUG_VALGRIND
+#ifdef UNIV_DEBUG
buf_page_t* prev = UT_LIST_GET_PREV(zip_list, bpage);
buf_page_t* next = UT_LIST_GET_NEXT(zip_list, bpage);
- if (prev) UNIV_MEM_VALID(prev, BUF_BUDDY_LOW << i);
- if (next) UNIV_MEM_VALID(next, BUF_BUDDY_LOW << i);
-
ut_ad(!prev || buf_page_get_state(prev) == BUF_BLOCK_ZIP_FREE);
ut_ad(!next || buf_page_get_state(next) == BUF_BLOCK_ZIP_FREE);
-#endif /* UNIV_DEBUG_VALGRIND */
+#endif /* UNIV_DEBUG */
//ut_ad(buf_pool_mutex_own());
ut_ad(mutex_own(&zip_free_mutex));
ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
UT_LIST_REMOVE(zip_list, buf_pool->zip_free[i], bpage);
-
-#ifdef UNIV_DEBUG_VALGRIND
- if (prev) UNIV_MEM_FREE(prev, BUF_BUDDY_LOW << i);
- if (next) UNIV_MEM_FREE(next, BUF_BUDDY_LOW << i);
-#endif /* UNIV_DEBUG_VALGRIND */
}
/**********************************************************************//**
@@ -139,17 +128,13 @@ buf_buddy_alloc_zip(
//ut_ad(buf_pool_mutex_own());
ut_ad(mutex_own(&zip_free_mutex));
ut_a(i < BUF_BUDDY_SIZES);
+ ut_a(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
+
+ ut_d(BUF_BUDDY_LIST_VALIDATE(i));
-#ifndef UNIV_DEBUG_VALGRIND
- /* Valgrind would complain about accessing free memory. */
- ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i],
- ut_ad(buf_page_get_state(ut_list_node_313)
- == BUF_BLOCK_ZIP_FREE)));
-#endif /* !UNIV_DEBUG_VALGRIND */
bpage = UT_LIST_GET_LAST(buf_pool->zip_free[i]);
if (bpage) {
- UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
ut_a(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
buf_buddy_remove_from_free(bpage, i);
@@ -168,13 +153,10 @@ buf_buddy_alloc_zip(
}
}
-#ifdef UNIV_DEBUG
if (bpage) {
- memset(bpage, ~i, BUF_BUDDY_LOW << i);
+ ut_d(memset(bpage, ~i, BUF_BUDDY_LOW << i));
+ UNIV_MEM_ALLOC(bpage, BUF_BUDDY_SIZES << i);
}
-#endif /* UNIV_DEBUG */
-
- UNIV_MEM_ALLOC(bpage, BUF_BUDDY_SIZES << i);
return(bpage);
}
@@ -266,6 +248,7 @@ buf_buddy_alloc_from(
{
ulint offs = BUF_BUDDY_LOW << j;
ut_ad(j <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
ut_ad(j >= i);
ut_ad(!ut_align_offset(buf, offs));
@@ -279,13 +262,7 @@ buf_buddy_alloc_from(
bpage = (buf_page_t*) ((byte*) buf + offs);
ut_d(memset(bpage, j, BUF_BUDDY_LOW << j));
bpage->state = BUF_BLOCK_ZIP_FREE;
-#ifndef UNIV_DEBUG_VALGRIND
- /* Valgrind would complain about accessing free memory. */
- ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i],
- ut_ad(buf_page_get_state(
- ut_list_node_313)
- == BUF_BLOCK_ZIP_FREE)));
-#endif /* !UNIV_DEBUG_VALGRIND */
+ ut_d(BUF_BUDDY_LIST_VALIDATE(i));
buf_buddy_add_to_free(bpage, j);
}
@@ -295,8 +272,8 @@ buf_buddy_alloc_from(
/**********************************************************************//**
Allocate a block. The thread calling this function must hold
buf_pool_mutex and must not hold buf_pool_zip_mutex or any block->mutex.
-The buf_pool_mutex may only be released and reacquired if lru != NULL.
-@return allocated block, possibly NULL if lru==NULL */
+The buf_pool_mutex may be released and reacquired.
+@return allocated block, never NULL */
UNIV_INTERN
void*
buf_buddy_alloc_low(
@@ -305,14 +282,15 @@ buf_buddy_alloc_low(
or BUF_BUDDY_SIZES */
ibool* lru, /*!< in: pointer to a variable that will be assigned
TRUE if storage was allocated from the LRU list
- and buf_pool_mutex was temporarily released,
- or NULL if the LRU list should not be used */
+ and buf_pool_mutex was temporarily released */
ibool have_page_hash_mutex)
{
buf_block_t* block;
+ ut_ad(lru);
//ut_ad(buf_pool_mutex_own());
ut_ad(!mutex_own(&buf_pool_zip_mutex));
+ ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
if (i < BUF_BUDDY_SIZES) {
/* Try to allocate from the buddy system. */
@@ -335,11 +313,6 @@ buf_buddy_alloc_low(
goto alloc_big;
}
- if (!lru) {
-
- return(NULL);
- }
-
/* Try replacing an uncompressed page in the buffer pool. */
//buf_pool_mutex_exit();
mutex_exit(&LRU_list_mutex);
@@ -368,76 +341,6 @@ func_exit:
}
/**********************************************************************//**
-Try to relocate the control block of a compressed page.
-@return TRUE if relocated */
-static
-ibool
-buf_buddy_relocate_block(
-/*=====================*/
- buf_page_t* bpage, /*!< in: block to relocate */
- buf_page_t* dpage) /*!< in: free block to relocate to */
-{
- buf_page_t* b;
-
- //ut_ad(buf_pool_mutex_own());
-#ifdef UNIV_SYNC_DEBUG
- ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX));
-#endif
-
- switch (buf_page_get_state(bpage)) {
- case BUF_BLOCK_ZIP_FREE:
- case BUF_BLOCK_NOT_USED:
- case BUF_BLOCK_READY_FOR_USE:
- case BUF_BLOCK_FILE_PAGE:
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
- /* ut_error; */ /* optimistic */
- case BUF_BLOCK_ZIP_DIRTY:
- /* Cannot relocate dirty pages. */
- return(FALSE);
-
- case BUF_BLOCK_ZIP_PAGE:
- break;
- }
-
- mutex_enter(&buf_pool_zip_mutex);
- mutex_enter(&zip_free_mutex);
-
- if (!buf_page_can_relocate(bpage)) {
- mutex_exit(&buf_pool_zip_mutex);
- mutex_exit(&zip_free_mutex);
- return(FALSE);
- }
-
- if (bpage != buf_page_hash_get(bpage->space, bpage->offset)) {
- mutex_exit(&buf_pool_zip_mutex);
- mutex_exit(&zip_free_mutex);
- return(FALSE);
- }
-
- buf_relocate(bpage, dpage);
- ut_d(bpage->state = BUF_BLOCK_ZIP_FREE);
-
- /* relocate buf_pool->zip_clean */
- mutex_enter(&flush_list_mutex);
- b = UT_LIST_GET_PREV(zip_list, dpage);
- UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, dpage);
-
- if (b) {
- UT_LIST_INSERT_AFTER(zip_list, buf_pool->zip_clean, b, dpage);
- } else {
- UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_clean, dpage);
- }
- mutex_exit(&flush_list_mutex);
-
- UNIV_MEM_INVALID(bpage, sizeof *bpage);
-
- mutex_exit(&buf_pool_zip_mutex);
- mutex_exit(&zip_free_mutex);
- return(TRUE);
-}
-
-/**********************************************************************//**
Try to relocate a block.
@return TRUE if relocated */
static
@@ -452,159 +355,120 @@ buf_buddy_relocate(
buf_page_t* bpage;
const ulint size = BUF_BUDDY_LOW << i;
ullint usec = ut_time_us(NULL);
+ mutex_t* mutex;
+ ulint space;
+ ulint page_no;
//ut_ad(buf_pool_mutex_own());
ut_ad(mutex_own(&zip_free_mutex));
ut_ad(!mutex_own(&buf_pool_zip_mutex));
ut_ad(!ut_align_offset(src, size));
ut_ad(!ut_align_offset(dst, size));
+ ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
UNIV_MEM_ASSERT_W(dst, size);
+ if (!have_page_hash_mutex) {
+ mutex_exit(&zip_free_mutex);
+ mutex_enter(&LRU_list_mutex);
+ rw_lock_x_lock(&page_hash_latch);
+ }
+
/* We assume that all memory from buf_buddy_alloc()
- is used for either compressed pages or buf_page_t
- objects covering compressed pages. */
+ is used for compressed page frames. */
/* We look inside the allocated objects returned by
- buf_buddy_alloc() and assume that anything of
- PAGE_ZIP_MIN_SIZE or larger is a compressed page that contains
- a valid space_id and page_no in the page header. Should the
- fields be invalid, we will be unable to relocate the block.
- We also assume that anything that fits sizeof(buf_page_t)
- actually is a properly initialized buf_page_t object. */
-
- if (size >= PAGE_ZIP_MIN_SIZE) {
- /* This is a compressed page. */
- mutex_t* mutex;
- ulint space, page_no;
+ buf_buddy_alloc() and assume that each block is a compressed
+ page that contains a valid space_id and page_no in the page
+ header. Should the fields be invalid, we will be unable to
+ relocate the block. */
+
+ /* The src block may be split into smaller blocks,
+ some of which may be free. Thus, the
+ mach_read_from_4() calls below may attempt to read
+ from free memory. The memory is "owned" by the buddy
+ allocator (and it has been allocated from the buffer
+ pool), so there is nothing wrong about this. The
+ mach_read_from_4() calls here will only trigger bogus
+ Valgrind memcheck warnings in UNIV_DEBUG_VALGRIND builds. */
+ space = mach_read_from_4((const byte *) src
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ page_no = mach_read_from_4((const byte *) src
+ + FIL_PAGE_OFFSET);
+ /* Suppress Valgrind warnings about conditional jump
+ on uninitialized value. */
+ UNIV_MEM_VALID(&space, sizeof space);
+ UNIV_MEM_VALID(&page_no, sizeof page_no);
+ bpage = buf_page_hash_get(space, page_no);
+
+ if (!bpage || bpage->zip.data != src) {
+ /* The block has probably been freshly
+ allocated by buf_LRU_get_free_block() but not
+ added to buf_pool->page_hash yet. Obviously,
+ it cannot be relocated. */
if (!have_page_hash_mutex) {
- mutex_exit(&zip_free_mutex);
- mutex_enter(&LRU_list_mutex);
- rw_lock_x_lock(&page_hash_latch);
- }
-
- /* The src block may be split into smaller blocks,
- some of which may be free. Thus, the
- mach_read_from_4() calls below may attempt to read
- from free memory. The memory is "owned" by the buddy
- allocator (and it has been allocated from the buffer
- pool), so there is nothing wrong about this. The
- mach_read_from_4() calls here will only trigger bogus
- Valgrind memcheck warnings in UNIV_DEBUG_VALGRIND builds. */
- space = mach_read_from_4(
- (const byte*) src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
- page_no = mach_read_from_4(
- (const byte*) src + FIL_PAGE_OFFSET);
- /* Suppress Valgrind warnings about conditional jump
- on uninitialized value. */
- UNIV_MEM_VALID(&space, sizeof space);
- UNIV_MEM_VALID(&page_no, sizeof page_no);
- bpage = buf_page_hash_get(space, page_no);
-
- if (!bpage || bpage->zip.data != src) {
- /* The block has probably been freshly
- allocated by buf_LRU_get_free_block() but not
- added to buf_pool->page_hash yet. Obviously,
- it cannot be relocated. */
-
- if (!have_page_hash_mutex) {
- mutex_enter(&zip_free_mutex);
- mutex_exit(&LRU_list_mutex);
- rw_lock_x_unlock(&page_hash_latch);
- }
- return(FALSE);
- }
-
- if (page_zip_get_size(&bpage->zip) != size) {
- /* The block is of different size. We would
- have to relocate all blocks covered by src.
- For the sake of simplicity, give up. */
- ut_ad(page_zip_get_size(&bpage->zip) < size);
-
- if (!have_page_hash_mutex) {
- mutex_enter(&zip_free_mutex);
- mutex_exit(&LRU_list_mutex);
- rw_lock_x_unlock(&page_hash_latch);
- }
- return(FALSE);
+ mutex_enter(&zip_free_mutex);
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
}
+ return(FALSE);
+ }
- /* To keep latch order */
- if (have_page_hash_mutex)
- mutex_exit(&zip_free_mutex);
-
- /* The block must have been allocated, but it may
- contain uninitialized data. */
- UNIV_MEM_ASSERT_W(src, size);
-
- mutex = buf_page_get_mutex_enter(bpage);
-
- mutex_enter(&zip_free_mutex);
-
- if (mutex && buf_page_can_relocate(bpage)) {
- /* Relocate the compressed page. */
- ut_a(bpage->zip.data == src);
- memcpy(dst, src, size);
- bpage->zip.data = dst;
- mutex_exit(mutex);
-success:
- UNIV_MEM_INVALID(src, size);
- {
- buf_buddy_stat_t* buddy_stat
- = &buf_buddy_stat[i];
- buddy_stat->relocated++;
- buddy_stat->relocated_usec
- += ut_time_us(NULL) - usec;
- }
-
- if (!have_page_hash_mutex) {
- mutex_exit(&LRU_list_mutex);
- rw_lock_x_unlock(&page_hash_latch);
- }
- return(TRUE);
- }
+ if (page_zip_get_size(&bpage->zip) != size) {
+ /* The block is of different size. We would
+ have to relocate all blocks covered by src.
+ For the sake of simplicity, give up. */
+ ut_ad(page_zip_get_size(&bpage->zip) < size);
if (!have_page_hash_mutex) {
+ mutex_enter(&zip_free_mutex);
mutex_exit(&LRU_list_mutex);
rw_lock_x_unlock(&page_hash_latch);
}
+ return(FALSE);
+ }
- if (mutex) {
- mutex_exit(mutex);
- }
- } else if (i == buf_buddy_get_slot(sizeof(buf_page_t))) {
- /* This must be a buf_page_t object. */
-#if UNIV_WORD_SIZE == 4
- /* On 32-bit systems, there is no padding in
- buf_page_t. On other systems, Valgrind could complain
- about uninitialized pad bytes. */
- UNIV_MEM_ASSERT_RW(src, size);
-#endif
-
+ /* To keep latch order */
+ if (have_page_hash_mutex)
mutex_exit(&zip_free_mutex);
- if (!have_page_hash_mutex) {
- mutex_enter(&LRU_list_mutex);
- rw_lock_x_lock(&page_hash_latch);
- }
+ /* The block must have been allocated, but it may
+ contain uninitialized data. */
+ UNIV_MEM_ASSERT_W(src, size);
- if (buf_buddy_relocate_block(src, dst)) {
- mutex_enter(&zip_free_mutex);
+ mutex = buf_page_get_mutex_enter(bpage);
- if (!have_page_hash_mutex) {
- mutex_exit(&LRU_list_mutex);
- rw_lock_x_unlock(&page_hash_latch);
- }
+ mutex_enter(&zip_free_mutex);
- goto success;
+ if (mutex && buf_page_can_relocate(bpage)) {
+ /* Relocate the compressed page. */
+ ut_a(bpage->zip.data == src);
+ memcpy(dst, src, size);
+ bpage->zip.data = dst;
+ mutex_exit(mutex);
+ UNIV_MEM_INVALID(src, size);
+ {
+ buf_buddy_stat_t* buddy_stat
+ = &buf_buddy_stat[i];
+ buddy_stat->relocated++;
+ buddy_stat->relocated_usec
+ += ut_time_us(NULL) - usec;
}
- mutex_enter(&zip_free_mutex);
-
if (!have_page_hash_mutex) {
mutex_exit(&LRU_list_mutex);
rw_lock_x_unlock(&page_hash_latch);
}
+ return(TRUE);
+ }
+
+ if (!have_page_hash_mutex) {
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+ }
+
+ if (mutex) {
+ mutex_exit(mutex);
}
return(FALSE);
@@ -629,12 +493,14 @@ buf_buddy_free_low(
ut_ad(mutex_own(&zip_free_mutex));
ut_ad(!mutex_own(&buf_pool_zip_mutex));
ut_ad(i <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
ut_ad(buf_buddy_stat[i].used > 0);
buf_buddy_stat[i].used--;
+
recombine:
UNIV_MEM_ASSERT_AND_ALLOC(buf, BUF_BUDDY_LOW << i);
- ut_d(((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE);
+ ((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE;
if (i == BUF_BUDDY_SIZES) {
mutex_exit(&zip_free_mutex);
@@ -647,32 +513,36 @@ recombine:
ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i));
ut_ad(!buf_pool_contains_zip(buf));
- /* Try to combine adjacent blocks. */
+ /* Do not recombine blocks if there are few free blocks.
+ We may waste up to 15360*max_len bytes to free blocks
+ (1024 + 2048 + 4096 + 8192 = 15360) */
+ if (UT_LIST_GET_LEN(buf_pool->zip_free[i]) < 16) {
+ goto func_exit;
+ }
+ /* Try to combine adjacent blocks. */
buddy = (buf_page_t*) buf_buddy_get(((byte*) buf), BUF_BUDDY_LOW << i);
#ifndef UNIV_DEBUG_VALGRIND
- /* Valgrind would complain about accessing free memory. */
+ /* When Valgrind instrumentation is not enabled, we can read
+ buddy->state to quickly determine that a block is not free.
+ When the block is not free, buddy->state belongs to a compressed
+ page frame that may be flagged uninitialized in our Valgrind
+ instrumentation. */
if (buddy->state != BUF_BLOCK_ZIP_FREE) {
goto buddy_nonfree;
}
-
- /* The field buddy->state can only be trusted for free blocks.
- If buddy->state == BUF_BLOCK_ZIP_FREE, the block is free if
- it is in the free list. */
#endif /* !UNIV_DEBUG_VALGRIND */
for (bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); bpage; ) {
- UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
if (bpage == buddy) {
-buddy_free:
/* The buddy is free: recombine */
buf_buddy_remove_from_free(bpage, i);
-buddy_free2:
+buddy_is_free:
ut_ad(buf_page_get_state(buddy) == BUF_BLOCK_ZIP_FREE);
ut_ad(!buf_pool_contains_zip(buddy));
i++;
@@ -682,122 +552,43 @@ buddy_free2:
}
ut_a(bpage != buf);
-
- {
- buf_page_t* next = UT_LIST_GET_NEXT(zip_list, bpage);
- UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i);
- bpage = next;
- }
+ UNIV_MEM_ASSERT_W(bpage, BUF_BUDDY_LOW << i);
+ bpage = UT_LIST_GET_NEXT(zip_list, bpage);
}
#ifndef UNIV_DEBUG_VALGRIND
buddy_nonfree:
- /* Valgrind would complain about accessing free memory. */
- ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i],
- ut_ad(buf_page_get_state(ut_list_node_313)
- == BUF_BLOCK_ZIP_FREE)));
-#endif /* UNIV_DEBUG_VALGRIND */
+#endif /* !UNIV_DEBUG_VALGRIND */
+
+ ut_d(BUF_BUDDY_LIST_VALIDATE(i));
/* The buddy is not free. Is there a free block of this size? */
bpage = UT_LIST_GET_LAST(buf_pool->zip_free[i]);
if (bpage) {
+
/* Remove the block from the free list, because a successful
buf_buddy_relocate() will overwrite bpage->list. */
-
- UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
buf_buddy_remove_from_free(bpage, i);
/* Try to relocate the buddy of buf to the free block. */
if (buf_buddy_relocate(buddy, bpage, i, have_page_hash_mutex)) {
- ut_d(buddy->state = BUF_BLOCK_ZIP_FREE);
- goto buddy_free2;
+ buddy->state = BUF_BLOCK_ZIP_FREE;
+ goto buddy_is_free;
}
buf_buddy_add_to_free(bpage, i);
-
- /* Try to relocate the buddy of the free block to buf. */
- buddy = (buf_page_t*) buf_buddy_get(((byte*) bpage),
- BUF_BUDDY_LOW << i);
-
-#ifndef UNIV_DEBUG_VALGRIND
- /* Valgrind would complain about accessing free memory. */
-
- /* The buddy must not be (completely) free, because we
- always recombine adjacent free blocks.
-
- (Parts of the buddy can be free in
- buf_pool->zip_free[j] with j < i.) */
- ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i],
- ut_ad(buf_page_get_state(
- ut_list_node_313)
- == BUF_BLOCK_ZIP_FREE
- && ut_list_node_313 != buddy)));
-#endif /* !UNIV_DEBUG_VALGRIND */
-
- if (buf_buddy_relocate(buddy, buf, i, have_page_hash_mutex)) {
-
- buf = bpage;
- UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
- ut_d(buddy->state = BUF_BLOCK_ZIP_FREE);
- goto buddy_free;
- }
}
+func_exit:
/* Free the block to the buddy list. */
bpage = buf;
-#ifdef UNIV_DEBUG
- if (i < buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)) {
- /* This area has most likely been allocated for at
- least one compressed-only block descriptor. Check
- that there are no live objects in the area. This is
- not a complete check: it may yield false positives as
- well as false negatives. Also, due to buddy blocks
- being recombined, it is possible (although unlikely)
- that this branch is never reached. */
-
- char* c;
-
-# ifndef UNIV_DEBUG_VALGRIND
- /* Valgrind would complain about accessing
- uninitialized memory. Besides, Valgrind performs a
- more exhaustive check, at every memory access. */
- const buf_page_t* b = buf;
- const buf_page_t* const b_end = (buf_page_t*)
- ((char*) b + (BUF_BUDDY_LOW << i));
-
- for (; b < b_end; b++) {
- /* Avoid false positives (and cause false
- negatives) by checking for b->space < 1000. */
-
- if ((b->state == BUF_BLOCK_ZIP_PAGE
- || b->state == BUF_BLOCK_ZIP_DIRTY)
- && b->space > 0 && b->space < 1000) {
- fprintf(stderr,
- "buddy dirty %p %u (%u,%u) %p,%lu\n",
- (void*) b,
- b->state, b->space, b->offset,
- buf, i);
- }
- }
-# endif /* !UNIV_DEBUG_VALGRIND */
-
- /* Scramble the block. This should make any pointers
- invalid and trigger a segmentation violation. Because
- the scrambling can be reversed, it may be possible to
- track down the object pointing to the freed data by
- dereferencing the unscrambled bpage->LRU or
- bpage->list pointers. */
- for (c = (char*) buf + (BUF_BUDDY_LOW << i);
- c-- > (char*) buf; ) {
- *c = ~*c ^ i;
- }
- } else {
- /* Fill large blocks with a constant pattern. */
- memset(bpage, i, BUF_BUDDY_LOW << i);
- }
-#endif /* UNIV_DEBUG */
+
+ /* Fill large blocks with a constant pattern. */
+ ut_d(memset(bpage, i, BUF_BUDDY_LOW << i));
+ UNIV_MEM_INVALID(bpage, BUF_BUDDY_LOW << i);
+
bpage->state = BUF_BLOCK_ZIP_FREE;
buf_buddy_add_to_free(bpage, i);
}
diff --git a/storage/xtradb/buf/buf0buf.c b/storage/xtradb/buf/buf0buf.c
index 40a907b4199..c76973a42ef 100644
--- a/storage/xtradb/buf/buf0buf.c
+++ b/storage/xtradb/buf/buf0buf.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -53,10 +53,6 @@ Created 11/5/1995 Heikki Tuuri
#include "page0zip.h"
#include "trx0trx.h"
#include "srv0start.h"
-#include "que0que.h"
-#include "read0read.h"
-#include "row0row.h"
-#include "ha_prototypes.h"
/* prototypes for new functions added to ha_innodb.cc */
trx_t* innobase_get_trx();
@@ -314,30 +310,6 @@ read-ahead or flush occurs */
UNIV_INTERN ibool buf_debug_prints = FALSE;
#endif /* UNIV_DEBUG */
-/* Buffer pool shared memory segment information */
-typedef struct buf_shm_info_struct buf_shm_info_t;
-
-struct buf_shm_info_struct {
- char head_str[8];
- ulint binary_id;
- ibool is_new; /* during initializing */
- ibool clean; /* clean shutdowned and free */
- ibool reusable; /* reusable */
- ulint buf_pool_size; /* backup value */
- ulint page_size; /* backup value */
- ulint frame_offset; /* offset of the first frame based on chunk->mem */
- ulint zip_hash_offset;
- ulint zip_hash_n;
-
- ulint checksum;
-
- buf_pool_t buf_pool_backup;
- buf_chunk_t chunk_backup;
-
- ib_uint64_t dummy;
-};
-
-#define BUF_SHM_INFO_HEAD "XTRA_SHM"
#endif /* !UNIV_HOTBACKUP */
/********************************************************************//**
@@ -767,6 +739,10 @@ buf_block_init(
block->page.in_flush_list = FALSE;
block->page.in_free_list = FALSE;
#endif /* UNIV_DEBUG */
+ block->page.flush_list.prev = NULL;
+ block->page.flush_list.next = NULL;
+ block->page.zip_list.prev = NULL;
+ block->page.zip_list.next = NULL;
block->page.in_LRU_list = FALSE;
block->in_unzip_LRU_list = FALSE;
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
@@ -784,45 +760,6 @@ buf_block_init(
#endif /* UNIV_SYNC_DEBUG */
}
-static
-void
-buf_block_reuse(
-/*============*/
- buf_block_t* block,
- ptrdiff_t frame_offset)
-{
- /* block_init */
- block->frame += frame_offset;
-
- UNIV_MEM_DESC(block->frame, UNIV_PAGE_SIZE, block);
-
- block->index = NULL;
-
-#ifdef UNIV_DEBUG
- /* recreate later */
- block->page.in_page_hash = FALSE;
- block->page.in_zip_hash = FALSE;
-#endif /* UNIV_DEBUG */
-
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
- block->n_pointers = 0;
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-
- if (block->page.zip.data)
- block->page.zip.data += frame_offset;
-
- block->is_hashed = FALSE;
-
- mutex_create(&block->mutex, SYNC_BUF_BLOCK);
-
- rw_lock_create(&block->lock, SYNC_LEVEL_VARYING);
- ut_ad(rw_lock_validate(&(block->lock)));
-
-#ifdef UNIV_SYNC_DEBUG
- rw_lock_create(&block->debug_latch, SYNC_NO_ORDER_CHECK);
-#endif /* UNIV_SYNC_DEBUG */
-}
-
/********************************************************************//**
Allocates a chunk of buffer frames.
@return chunk, or NULL on failure */
@@ -835,190 +772,28 @@ buf_chunk_init(
{
buf_block_t* block;
byte* frame;
- ulint zip_hash_n = 0;
- ulint zip_hash_mem_size = 0;
- hash_table_t* zip_hash_tmp = NULL;
ulint i;
ulint size_target;
- buf_shm_info_t* shm_info = NULL;
/* Round down to a multiple of page size,
although it already should be. */
mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE);
size_target = (mem_size / UNIV_PAGE_SIZE) - 1;
-
- srv_buffer_pool_shm_is_reused = FALSE;
-
- if (srv_buffer_pool_shm_key) {
- /* zip_hash size */
- zip_hash_n = (mem_size / UNIV_PAGE_SIZE) * 2;
- zip_hash_mem_size = ut_2pow_round(hash_create_needed(zip_hash_n)
- + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
- }
-
/* Reserve space for the block descriptors. */
mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block)
+ (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
- if (srv_buffer_pool_shm_key) {
- mem_size += ut_2pow_round(sizeof(buf_shm_info_t)
- + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
- mem_size += zip_hash_mem_size;
- }
chunk->mem_size = mem_size;
-
- if (srv_buffer_pool_shm_key) {
- ulint binary_id;
- ibool is_new;
-
- ut_a(buf_pool->n_chunks == 1);
-
- fprintf(stderr,
- "InnoDB: Warning: The innodb_buffer_pool_shm_key option has been specified.\n"
- "InnoDB: Do not change the following between restarts of the server while this option is being used:\n"
- "InnoDB: * the mysqld executable between restarts of the server.\n"
- "InnoDB: * the value of innodb_buffer_pool_size.\n"
- "InnoDB: * the value of innodb_page_size.\n"
- "InnoDB: * datafiles created by InnoDB during this session.\n"
- "InnoDB: Otherwise, data corruption in datafiles may result.\n");
-
- /* FIXME: This is vague id still */
- binary_id = (ulint) ((byte*)mtr_commit - (byte*)btr_root_get)
- + (ulint) ((byte*)os_get_os_version - (byte*)buf_calc_page_new_checksum)
- + (ulint) ((byte*)page_dir_find_owner_slot - (byte*)dfield_data_is_binary_equal)
- + (ulint) ((byte*)que_graph_publish - (byte*)dict_casedn_str)
- + (ulint) ((byte*)read_view_oldest_copy_or_open_new - (byte*)fil_space_get_version)
- + (ulint) ((byte*)rec_get_n_extern_new - (byte*)fsp_get_size_low)
- + (ulint) ((byte*)row_get_trx_id_offset - (byte*)ha_create_func)
- + (ulint) ((byte*)srv_set_io_thread_op_info - (byte*)thd_is_replication_slave_thread)
- + (ulint) ((byte*)mutex_create_func - (byte*)ibuf_inside)
- + (ulint) ((byte*)trx_set_detailed_error - (byte*)lock_check_trx_id_sanity)
- + (ulint) ((byte*)ut_time - (byte*)mem_heap_strdup);
-
- chunk->mem = os_shm_alloc(&chunk->mem_size, srv_buffer_pool_shm_key, &is_new);
-
- if (UNIV_UNLIKELY(chunk->mem == NULL)) {
- return(NULL);
- }
-init_again:
-#ifdef UNIV_SET_MEM_TO_ZERO
- if (is_new) {
- memset(chunk->mem, '\0', chunk->mem_size);
- }
-#endif
- /* for ut_fold_binary_32(), these values should be 32-bit aligned */
- ut_a(sizeof(buf_shm_info_t) % 4 == 0);
- ut_a((ulint)chunk->mem % 4 == 0);
- ut_a(chunk->mem_size % 4 == 0);
-
- shm_info = chunk->mem;
-
- zip_hash_tmp = (hash_table_t*)((byte*)chunk->mem + chunk->mem_size - zip_hash_mem_size);
-
- if (is_new) {
- strncpy(shm_info->head_str, BUF_SHM_INFO_HEAD, 8);
- shm_info->binary_id = binary_id;
- shm_info->is_new = TRUE; /* changed to FALSE when the initialization is finished */
- shm_info->clean = FALSE; /* changed to TRUE when free the segment. */
- shm_info->reusable = FALSE; /* changed to TRUE when validation is finished. */
- shm_info->buf_pool_size = srv_buf_pool_size;
- shm_info->page_size = srv_page_size;
- shm_info->zip_hash_offset = chunk->mem_size - zip_hash_mem_size;
- shm_info->zip_hash_n = zip_hash_n;
- } else {
- ulint checksum;
-
- if (strncmp(shm_info->head_str, BUF_SHM_INFO_HEAD, 8)) {
- fprintf(stderr,
- "InnoDB: Error: The shared memory segment seems not to be for buffer pool.\n");
- return(NULL);
- }
- if (shm_info->binary_id != binary_id) {
- fprintf(stderr,
- "InnoDB: Error: The shared memory segment seems not to be for this binary.\n");
- return(NULL);
- }
- if (shm_info->is_new) {
- fprintf(stderr,
- "InnoDB: Error: The shared memory was not initialized yet.\n");
- return(NULL);
- }
- if (shm_info->buf_pool_size != srv_buf_pool_size) {
- fprintf(stderr,
- "InnoDB: Error: srv_buf_pool_size is different (shm=%lu current=%lu).\n",
- shm_info->buf_pool_size, srv_buf_pool_size);
- return(NULL);
- }
- if (shm_info->page_size != srv_page_size) {
- fprintf(stderr,
- "InnoDB: Error: srv_page_size is different (shm=%lu current=%lu).\n",
- shm_info->page_size, srv_page_size);
- return(NULL);
- }
- if (!shm_info->reusable) {
- fprintf(stderr,
- "InnoDB: Warning: The shared memory has unrecoverable contents.\n"
- "InnoDB: The shared memory segment is initialized.\n");
- is_new = TRUE;
- goto init_again;
- }
- if (!shm_info->clean) {
- fprintf(stderr,
- "InnoDB: Warning: The shared memory was not shut down cleanly.\n"
- "InnoDB: The shared memory segment is initialized.\n");
- is_new = TRUE;
- goto init_again;
- }
-
- ut_a(shm_info->zip_hash_offset == chunk->mem_size - zip_hash_mem_size);
- ut_a(shm_info->zip_hash_n == zip_hash_n);
-
- /* check checksum */
- if (srv_buffer_pool_shm_checksum) {
- checksum = ut_fold_binary_32((byte*)chunk->mem + sizeof(buf_shm_info_t),
- chunk->mem_size - sizeof(buf_shm_info_t));
- } else {
- checksum = BUF_NO_CHECKSUM_MAGIC;
- }
-
- if (shm_info->checksum != BUF_NO_CHECKSUM_MAGIC
- && shm_info->checksum != checksum) {
- fprintf(stderr,
- "InnoDB: Error: checksum of the shared memory is not match. "
- "(stored=%lu calculated=%lu)\n",
- shm_info->checksum, checksum);
- return(NULL);
- }
-
- /* flag to use the segment. */
- shm_info->clean = FALSE; /* changed to TRUE when free the segment. */
- }
-
- /* init zip_hash contents */
- if (is_new) {
- hash_create_init(zip_hash_tmp, zip_hash_n);
- } else {
- /* adjust offset is done later */
- hash_create_reuse(zip_hash_tmp);
-
- srv_buffer_pool_shm_is_reused = TRUE;
- }
- } else {
chunk->mem = os_mem_alloc_large(&chunk->mem_size);
if (UNIV_UNLIKELY(chunk->mem == NULL)) {
return(NULL);
}
- }
/* Allocate the block descriptors from
the start of the memory block. */
- if (srv_buffer_pool_shm_key) {
- chunk->blocks = (buf_block_t*)((byte*)chunk->mem + sizeof(buf_shm_info_t));
- } else {
chunk->blocks = chunk->mem;
- }
/* Align a pointer to the first frame. Note that when
os_large_page_size is smaller than UNIV_PAGE_SIZE,
@@ -1026,13 +801,8 @@ init_again:
it is bigger, we may allocate more blocks than requested. */
frame = ut_align(chunk->mem, UNIV_PAGE_SIZE);
- if (srv_buffer_pool_shm_key) {
- /* reserve zip_hash space and always -1 for reproductibity */
- chunk->size = (chunk->mem_size - zip_hash_mem_size) / UNIV_PAGE_SIZE - 1;
- } else {
chunk->size = chunk->mem_size / UNIV_PAGE_SIZE
- (frame != chunk->mem);
- }
/* Subtract the space needed for block descriptors. */
{
@@ -1050,99 +820,6 @@ init_again:
chunk->size = size_target;
}
- if (shm_info && !(shm_info->is_new)) {
- /* convert the shared memory segment for reuse */
- ptrdiff_t phys_offset;
- ptrdiff_t logi_offset;
- ptrdiff_t blocks_offset;
- void* previous_frame_address;
-
- if (chunk->size < shm_info->chunk_backup.size) {
- fprintf(stderr,
- "InnoDB: Error: The buffer pool became smaller because of allocated address.\n"
- "InnoDB: Retrying may avoid this situation.\n");
- shm_info->clean = TRUE; /* release the flag for retrying */
- return(NULL);
- }
-
- chunk->size = shm_info->chunk_backup.size;
- phys_offset = frame - ((byte*)chunk->mem + shm_info->frame_offset);
- logi_offset = frame - chunk->blocks[0].frame;
- previous_frame_address = chunk->blocks[0].frame;
- blocks_offset = (byte*)chunk->blocks - (byte*)shm_info->chunk_backup.blocks;
-
- if (phys_offset || logi_offset || blocks_offset) {
- fprintf(stderr,
- "InnoDB: Buffer pool in the shared memory segment should be converted.\n"
- "InnoDB: Previous frames in address : %p\n"
- "InnoDB: Previous frames were located : %p\n"
- "InnoDB: Current frames should be located: %p\n"
- "InnoDB: Pysical offset : %ld (%#lx)\n"
- "InnoDB: Logical offset (frames) : %ld (%#lx)\n"
- "InnoDB: Logical offset (blocks) : %ld (%#lx)\n",
- (byte*)chunk->mem + shm_info->frame_offset,
- chunk->blocks[0].frame, frame,
- (long) phys_offset, (long) phys_offset,
- (long) logi_offset, (long) logi_offset,
- (long) blocks_offset, (long) blocks_offset);
- } else {
- fprintf(stderr,
- "InnoDB: Buffer pool in the shared memory segment can be used as it is.\n");
- }
-
- if (phys_offset) {
- fprintf(stderr,
- "InnoDB: Aligning physical offset...");
-
- memmove(frame, (byte*)chunk->mem + shm_info->frame_offset,
- chunk->size * UNIV_PAGE_SIZE);
-
- fprintf(stderr,
- " Done.\n");
- }
-
- /* buf_block_t */
- block = chunk->blocks;
- for (i = chunk->size; i--; ) {
- buf_block_reuse(block, logi_offset);
- block++;
- }
-
- if (logi_offset || blocks_offset) {
- fprintf(stderr,
- "InnoDB: Aligning logical offset...");
-
-
- /* buf_pool_t buf_pool_backup */
- UT_LIST_OFFSET(flush_list, buf_page_t, shm_info->buf_pool_backup.flush_list,
- previous_frame_address, logi_offset, blocks_offset);
- UT_LIST_OFFSET(free, buf_page_t, shm_info->buf_pool_backup.free,
- previous_frame_address, logi_offset, blocks_offset);
- UT_LIST_OFFSET(LRU, buf_page_t, shm_info->buf_pool_backup.LRU,
- previous_frame_address, logi_offset, blocks_offset);
- if (shm_info->buf_pool_backup.LRU_old)
- shm_info->buf_pool_backup.LRU_old =
- (buf_page_t*)((byte*)(shm_info->buf_pool_backup.LRU_old)
- + (((void*)shm_info->buf_pool_backup.LRU_old > previous_frame_address)
- ? logi_offset : blocks_offset));
-
- UT_LIST_OFFSET(unzip_LRU, buf_block_t, shm_info->buf_pool_backup.unzip_LRU,
- previous_frame_address, logi_offset, blocks_offset);
-
- UT_LIST_OFFSET(zip_list, buf_page_t, shm_info->buf_pool_backup.zip_clean,
- previous_frame_address, logi_offset, blocks_offset);
- for (i = 0; i < BUF_BUDDY_SIZES_MAX; i++) {
- UT_LIST_OFFSET(zip_list, buf_page_t, shm_info->buf_pool_backup.zip_free[i],
- previous_frame_address, logi_offset, blocks_offset);
- }
-
- HASH_OFFSET(zip_hash_tmp, buf_page_t, hash,
- previous_frame_address, logi_offset, blocks_offset);
-
- fprintf(stderr,
- " Done.\n");
- }
- } else {
/* Init block structs and assign frames for them. Then we
assign the frames to the first blocks (we already mapped the
memory above). */
@@ -1166,11 +843,6 @@ init_again:
block++;
frame += UNIV_PAGE_SIZE;
}
- }
-
- if (shm_info) {
- shm_info->frame_offset = chunk->blocks[0].frame - (byte*)chunk->mem;
- }
return(chunk);
}
@@ -1287,76 +959,6 @@ buf_chunk_not_freed(
return(NULL);
}
-/*********************************************************************//**
-Checks that all blocks in the buffer chunk are in BUF_BLOCK_NOT_USED state.
-@return TRUE if all freed */
-static
-ibool
-buf_chunk_all_free(
-/*===============*/
- const buf_chunk_t* chunk) /*!< in: chunk being checked */
-{
- const buf_block_t* block;
- ulint i;
-
- ut_ad(buf_pool);
- ut_ad(buf_pool_mutex_own()); /* but we need all mutex here */
-
- block = chunk->blocks;
-
- for (i = chunk->size; i--; block++) {
-
- if (buf_block_get_state(block) != BUF_BLOCK_NOT_USED) {
-
- return(FALSE);
- }
- }
-
- return(TRUE);
-}
-
-/********************************************************************//**
-Frees a chunk of buffer frames. */
-static
-void
-buf_chunk_free(
-/*===========*/
- buf_chunk_t* chunk) /*!< out: chunk of buffers */
-{
- buf_block_t* block;
- const buf_block_t* block_end;
-
- ut_ad(buf_pool_mutex_own()); /* but we need all mutex here */
-
- block_end = chunk->blocks + chunk->size;
-
- for (block = chunk->blocks; block < block_end; block++) {
- ut_a(buf_block_get_state(block) == BUF_BLOCK_NOT_USED);
- ut_a(!block->page.zip.data);
-
- ut_ad(!block->page.in_LRU_list);
- ut_ad(!block->in_unzip_LRU_list);
- ut_ad(!block->page.in_flush_list);
- /* Remove the block from the free list. */
- mutex_enter(&free_list_mutex);
- ut_ad(block->page.in_free_list);
- UT_LIST_REMOVE(free, buf_pool->free, (&block->page));
- mutex_exit(&free_list_mutex);
-
- /* Free the latches. */
- mutex_free(&block->mutex);
- rw_lock_free(&block->lock);
-#ifdef UNIV_SYNC_DEBUG
- rw_lock_free(&block->debug_latch);
-#endif /* UNIV_SYNC_DEBUG */
- UNIV_MEM_UNDESC(block);
- }
-
- ut_a(!srv_buffer_pool_shm_key);
-
- os_mem_free_large(chunk->mem, chunk->mem_size);
-}
-
/********************************************************************//**
Creates the buffer pool.
@return own: buf_pool object, NULL if not enough memory or error */
@@ -1403,10 +1005,7 @@ buf_pool_init(void)
srv_buf_pool_curr_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
buf_pool->page_hash = hash_create(2 * buf_pool->curr_size);
- /* zip_hash is allocated to shm when srv_buffer_pool_shm_key is enabled */
- if (!srv_buffer_pool_shm_key) {
buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
- }
buf_pool->last_printout_time = time(NULL);
@@ -1421,86 +1020,6 @@ buf_pool_init(void)
--------------------------- */
/* All fields are initialized by mem_zalloc(). */
- if (srv_buffer_pool_shm_key) {
- buf_shm_info_t* shm_info;
-
- ut_a((byte*)chunk->blocks == (byte*)chunk->mem + sizeof(buf_shm_info_t));
- shm_info = chunk->mem;
-
- buf_pool->zip_hash = (hash_table_t*)((byte*)chunk->mem + shm_info->zip_hash_offset);
-
- if(shm_info->is_new) {
- shm_info->is_new = FALSE; /* initialization was finished */
- } else {
- buf_block_t* block = chunk->blocks;
- buf_page_t* b;
-
- /* shm_info->buf_pool_backup should be converted */
- /* at buf_chunk_init(). So copy simply. */
- buf_pool->flush_list = shm_info->buf_pool_backup.flush_list;
- buf_pool->freed_page_clock = shm_info->buf_pool_backup.freed_page_clock;
- buf_pool->free = shm_info->buf_pool_backup.free;
- buf_pool->LRU = shm_info->buf_pool_backup.LRU;
- buf_pool->LRU_old = shm_info->buf_pool_backup.LRU_old;
- buf_pool->LRU_old_len = shm_info->buf_pool_backup.LRU_old_len;
- buf_pool->unzip_LRU = shm_info->buf_pool_backup.unzip_LRU;
- buf_pool->zip_clean = shm_info->buf_pool_backup.zip_clean;
- for (i = 0; i < BUF_BUDDY_SIZES_MAX; i++) {
- buf_pool->zip_free[i] = shm_info->buf_pool_backup.zip_free[i];
- }
-
- for (i = 0; i < chunk->size; i++, block++) {
- if (buf_block_get_state(block)
- == BUF_BLOCK_FILE_PAGE) {
- ut_d(block->page.in_page_hash = TRUE);
- HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
- buf_page_address_fold(
- block->page.space,
- block->page.offset),
- &block->page);
- }
- }
-
- for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
- b = UT_LIST_GET_NEXT(zip_list, b)) {
- ut_ad(!b->in_flush_list);
- ut_ad(b->in_LRU_list);
-
- ut_d(b->in_page_hash = TRUE);
- HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
- buf_page_address_fold(b->space, b->offset), b);
- }
-
- for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
- b = UT_LIST_GET_NEXT(flush_list, b)) {
- ut_ad(b->in_flush_list);
- ut_ad(b->in_LRU_list);
-
- switch (buf_page_get_state(b)) {
- case BUF_BLOCK_ZIP_DIRTY:
- ut_d(b->in_page_hash = TRUE);
- HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
- buf_page_address_fold(b->space,
- b->offset), b);
- break;
- case BUF_BLOCK_FILE_PAGE:
- /* uncompressed page */
- break;
- case BUF_BLOCK_ZIP_FREE:
- case BUF_BLOCK_ZIP_PAGE:
- case BUF_BLOCK_NOT_USED:
- case BUF_BLOCK_READY_FOR_USE:
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
- ut_error;
- break;
- }
- }
-
-
- }
- }
-
mutex_exit(&LRU_list_mutex);
rw_lock_x_unlock(&page_hash_latch);
buf_pool_mutex_exit();
@@ -1525,49 +1044,16 @@ buf_pool_free(void)
buf_chunk_t* chunk;
buf_chunk_t* chunks;
- if (srv_buffer_pool_shm_key) {
- buf_shm_info_t* shm_info;
-
- ut_a(buf_pool->n_chunks == 1);
-
- chunk = buf_pool->chunks;
- shm_info = chunk->mem;
- ut_a((byte*)chunk->blocks == (byte*)chunk->mem + sizeof(buf_shm_info_t));
-
- /* validation the shared memory segment doesn't have unrecoverable contents. */
- /* Currently, validation became not needed */
- shm_info->reusable = TRUE;
-
- memcpy(&(shm_info->buf_pool_backup), buf_pool, sizeof(buf_pool_t));
- memcpy(&(shm_info->chunk_backup), chunk, sizeof(buf_chunk_t));
-
- if (srv_fast_shutdown < 2) {
- if (srv_buffer_pool_shm_checksum) {
- shm_info->checksum = ut_fold_binary_32((byte*)chunk->mem + sizeof(buf_shm_info_t),
- chunk->mem_size - sizeof(buf_shm_info_t));
- } else {
- shm_info->checksum = BUF_NO_CHECKSUM_MAGIC;
- }
- shm_info->clean = TRUE;
- }
-
- os_shm_free(chunk->mem, chunk->mem_size);
- } else {
chunks = buf_pool->chunks;
chunk = chunks + buf_pool->n_chunks;
while (--chunk >= chunks) {
- /* Bypass the checks of buf_chunk_free(), since they
- would fail at shutdown. */
os_mem_free_large(chunk->mem, chunk->mem_size);
}
- }
mem_free(buf_pool->chunks);
hash_table_free(buf_pool->page_hash);
- if (!srv_buffer_pool_shm_key) {
hash_table_free(buf_pool->zip_hash);
- }
mem_free(buf_pool);
buf_pool = NULL;
}
@@ -1741,335 +1227,6 @@ buf_relocate(
}
/********************************************************************//**
-Shrinks the buffer pool. */
-static
-void
-buf_pool_shrink(
-/*============*/
- ulint chunk_size) /*!< in: number of pages to remove */
-{
- buf_chunk_t* chunks;
- buf_chunk_t* chunk;
- ulint max_size;
- ulint max_free_size;
- buf_chunk_t* max_chunk;
- buf_chunk_t* max_free_chunk;
-
- ut_ad(!buf_pool_mutex_own());
-
-try_again:
- btr_search_disable(); /* Empty the adaptive hash index again */
- //buf_pool_mutex_enter();
- mutex_enter(&LRU_list_mutex);
-
- if (srv_buffer_pool_shm_key) {
- /* Cannot support shrink */
- goto func_done;
- }
-
-shrink_again:
- if (buf_pool->n_chunks <= 1) {
-
- /* Cannot shrink if there is only one chunk */
- goto func_done;
- }
-
- /* Search for the largest free chunk
- not larger than the size difference */
- chunks = buf_pool->chunks;
- chunk = chunks + buf_pool->n_chunks;
- max_size = max_free_size = 0;
- max_chunk = max_free_chunk = NULL;
-
- while (--chunk >= chunks) {
- if (chunk->size <= chunk_size
- && chunk->size > max_free_size) {
- if (chunk->size > max_size) {
- max_size = chunk->size;
- max_chunk = chunk;
- }
-
- if (buf_chunk_all_free(chunk)) {
- max_free_size = chunk->size;
- max_free_chunk = chunk;
- }
- }
- }
-
- if (!max_free_size) {
-
- ulint dirty = 0;
- ulint nonfree = 0;
- buf_block_t* block;
- buf_block_t* bend;
-
- /* Cannot shrink: try again later
- (do not assign srv_buf_pool_old_size) */
- if (!max_chunk) {
-
- goto func_exit;
- }
-
- block = max_chunk->blocks;
- bend = block + max_chunk->size;
-
- /* Move the blocks of chunk to the end of the
- LRU list and try to flush them. */
- for (; block < bend; block++) {
- switch (buf_block_get_state(block)) {
- case BUF_BLOCK_NOT_USED:
- continue;
- case BUF_BLOCK_FILE_PAGE:
- break;
- default:
- nonfree++;
- continue;
- }
-
- mutex_enter(&block->mutex);
- /* The following calls will temporarily
- release block->mutex and buf_pool_mutex.
- Therefore, we have to always retry,
- even if !dirty && !nonfree. */
-
- if (!buf_flush_ready_for_replace(&block->page)) {
-
- buf_LRU_make_block_old(&block->page);
- dirty++;
- } else if (buf_LRU_free_block(&block->page, TRUE, FALSE)
- != BUF_LRU_FREED) {
- nonfree++;
- }
-
- mutex_exit(&block->mutex);
- }
-
- //buf_pool_mutex_exit();
- mutex_exit(&LRU_list_mutex);
-
- /* Request for a flush of the chunk if it helps.
- Do not flush if there are non-free blocks, since
- flushing will not make the chunk freeable. */
- if (nonfree) {
- /* Avoid busy-waiting. */
- os_thread_sleep(100000);
- } else if (dirty
- && buf_flush_batch(BUF_FLUSH_LRU, dirty, 0)
- == ULINT_UNDEFINED) {
-
- buf_flush_wait_batch_end(BUF_FLUSH_LRU);
- }
-
- goto try_again;
- }
-
- max_size = max_free_size;
- max_chunk = max_free_chunk;
-
- srv_buf_pool_old_size = srv_buf_pool_size;
-
- /* Rewrite buf_pool->chunks. Copy everything but max_chunk. */
- chunks = mem_alloc((buf_pool->n_chunks - 1) * sizeof *chunks);
- memcpy(chunks, buf_pool->chunks,
- (max_chunk - buf_pool->chunks) * sizeof *chunks);
- memcpy(chunks + (max_chunk - buf_pool->chunks),
- max_chunk + 1,
- buf_pool->chunks + buf_pool->n_chunks
- - (max_chunk + 1));
- ut_a(buf_pool->curr_size > max_chunk->size);
- buf_pool->curr_size -= max_chunk->size;
- srv_buf_pool_curr_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
- chunk_size -= max_chunk->size;
- buf_chunk_free(max_chunk);
- mem_free(buf_pool->chunks);
- buf_pool->chunks = chunks;
- buf_pool->n_chunks--;
-
- /* Allow a slack of one megabyte. */
- if (chunk_size > 1048576 / UNIV_PAGE_SIZE) {
-
- goto shrink_again;
- }
-
-func_done:
- srv_buf_pool_old_size = srv_buf_pool_size;
-func_exit:
- //buf_pool_mutex_exit();
- mutex_exit(&LRU_list_mutex);
- btr_search_enable();
-}
-
-/********************************************************************//**
-Rebuild buf_pool->page_hash. */
-static
-void
-buf_pool_page_hash_rebuild(void)
-/*============================*/
-{
- ulint i;
- ulint n_chunks;
- buf_chunk_t* chunk;
- hash_table_t* page_hash;
- hash_table_t* zip_hash;
- buf_page_t* b;
-
- //buf_pool_mutex_enter();
- mutex_enter(&LRU_list_mutex);
- rw_lock_x_lock(&page_hash_latch);
- mutex_enter(&flush_list_mutex);
-
-
- /* Free, create, and populate the hash table. */
- hash_table_free(buf_pool->page_hash);
- buf_pool->page_hash = page_hash = hash_create(2 * buf_pool->curr_size);
- zip_hash = hash_create(2 * buf_pool->curr_size);
-
- HASH_MIGRATE(buf_pool->zip_hash, zip_hash, buf_page_t, hash,
- BUF_POOL_ZIP_FOLD_BPAGE);
-
- hash_table_free(buf_pool->zip_hash);
- buf_pool->zip_hash = zip_hash;
-
- /* Insert the uncompressed file pages to buf_pool->page_hash. */
-
- chunk = buf_pool->chunks;
- n_chunks = buf_pool->n_chunks;
-
- for (i = 0; i < n_chunks; i++, chunk++) {
- ulint j;
- buf_block_t* block = chunk->blocks;
-
- for (j = 0; j < chunk->size; j++, block++) {
- if (buf_block_get_state(block)
- == BUF_BLOCK_FILE_PAGE) {
- ut_ad(!block->page.in_zip_hash);
- ut_ad(block->page.in_page_hash);
-
- HASH_INSERT(buf_page_t, hash, page_hash,
- buf_page_address_fold(
- block->page.space,
- block->page.offset),
- &block->page);
- }
- }
- }
-
- /* Insert the compressed-only pages to buf_pool->page_hash.
- All such blocks are either in buf_pool->zip_clean or
- in buf_pool->flush_list. */
-
- for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
- b = UT_LIST_GET_NEXT(zip_list, b)) {
- ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
- ut_ad(!b->in_flush_list);
- ut_ad(b->in_LRU_list);
- ut_ad(b->in_page_hash);
- ut_ad(!b->in_zip_hash);
-
- HASH_INSERT(buf_page_t, hash, page_hash,
- buf_page_address_fold(b->space, b->offset), b);
- }
-
- for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
- b = UT_LIST_GET_NEXT(flush_list, b)) {
- ut_ad(b->in_flush_list);
- ut_ad(b->in_LRU_list);
- ut_ad(b->in_page_hash);
- ut_ad(!b->in_zip_hash);
-
- switch (buf_page_get_state(b)) {
- case BUF_BLOCK_ZIP_DIRTY:
- HASH_INSERT(buf_page_t, hash, page_hash,
- buf_page_address_fold(b->space,
- b->offset), b);
- break;
- case BUF_BLOCK_FILE_PAGE:
- /* uncompressed page */
- break;
- case BUF_BLOCK_ZIP_FREE:
- case BUF_BLOCK_ZIP_PAGE:
- case BUF_BLOCK_NOT_USED:
- case BUF_BLOCK_READY_FOR_USE:
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
- ut_error;
- break;
- }
- }
-
- //buf_pool_mutex_exit();
- mutex_exit(&LRU_list_mutex);
- rw_lock_x_unlock(&page_hash_latch);
- mutex_exit(&flush_list_mutex);
-}
-
-/********************************************************************//**
-Resizes the buffer pool. */
-UNIV_INTERN
-void
-buf_pool_resize(void)
-/*=================*/
-{
- if (srv_buffer_pool_shm_key) {
- /* Cannot support resize */
- return;
- }
-
- //buf_pool_mutex_enter();
- mutex_enter(&LRU_list_mutex);
-
- if (srv_buf_pool_old_size == srv_buf_pool_size) {
-
- //buf_pool_mutex_exit();
- mutex_exit(&LRU_list_mutex);
- return;
- }
-
- if (srv_buf_pool_curr_size + 1048576 > srv_buf_pool_size) {
-
- //buf_pool_mutex_exit();
- mutex_exit(&LRU_list_mutex);
-
- /* Disable adaptive hash indexes and empty the index
- in order to free up memory in the buffer pool chunks. */
- buf_pool_shrink((srv_buf_pool_curr_size - srv_buf_pool_size)
- / UNIV_PAGE_SIZE);
- } else if (srv_buf_pool_curr_size + 1048576 < srv_buf_pool_size) {
-
- /* Enlarge the buffer pool by at least one megabyte */
-
- ulint mem_size
- = srv_buf_pool_size - srv_buf_pool_curr_size;
- buf_chunk_t* chunks;
- buf_chunk_t* chunk;
-
- chunks = mem_alloc((buf_pool->n_chunks + 1) * sizeof *chunks);
-
- memcpy(chunks, buf_pool->chunks, buf_pool->n_chunks
- * sizeof *chunks);
-
- chunk = &chunks[buf_pool->n_chunks];
-
- if (!buf_chunk_init(chunk, mem_size)) {
- mem_free(chunks);
- } else {
- buf_pool->curr_size += chunk->size;
- srv_buf_pool_curr_size = buf_pool->curr_size
- * UNIV_PAGE_SIZE;
- mem_free(buf_pool->chunks);
- buf_pool->chunks = chunks;
- buf_pool->n_chunks++;
- }
-
- srv_buf_pool_old_size = srv_buf_pool_size;
- //buf_pool_mutex_exit();
- mutex_exit(&LRU_list_mutex);
- }
-
- buf_pool_page_hash_rebuild();
-}
-
-/********************************************************************//**
Moves a page to the start of the buffer pool LRU list. This high-level
function can be used to prevent an important page from slipping out of
the buffer pool. */
@@ -2303,6 +1460,27 @@ lookup:
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}
+ if (UNIV_UNLIKELY(bpage->space_was_being_deleted)) {
+ /* This page is obsoleted, should discard and retry */
+ rw_lock_s_unlock(&page_hash_latch);
+
+ mutex_enter(&LRU_list_mutex);
+ block_mutex = buf_page_get_mutex_enter(bpage);
+
+ if (UNIV_UNLIKELY(!block_mutex)) {
+ mutex_exit(&LRU_list_mutex);
+ goto lookup;
+ }
+
+ buf_LRU_free_block(bpage, TRUE, TRUE);
+
+ mutex_exit(&LRU_list_mutex);
+ mutex_exit(block_mutex);
+ block_mutex = NULL;
+
+ goto lookup;
+ }
+
if (UNIV_UNLIKELY(!bpage->zip.data)) {
/* There is no compressed page. */
err_exit:
@@ -2311,13 +1489,12 @@ err_exit:
return(NULL);
}
- if (srv_pass_corrupt_table) {
+ if (srv_pass_corrupt_table <= 1) {
if (bpage->is_corrupt) {
rw_lock_s_unlock(&page_hash_latch);
return(NULL);
}
}
- ut_a(!(bpage->is_corrupt));
block_mutex = buf_page_get_mutex_enter(bpage);
@@ -2340,13 +1517,32 @@ err_exit:
case BUF_BLOCK_FILE_PAGE:
ut_a(block_mutex == &((buf_block_t*) bpage)->mutex);
- /* Discard the uncompressed page frame if possible. */
- if (buf_LRU_free_block(bpage, FALSE, FALSE) == BUF_LRU_FREED) {
+ /* release mutex to obey to latch-order */
+ mutex_exit(block_mutex);
+
+ /* get LRU_list_mutex for buf_LRU_free_block() */
+ mutex_enter(&LRU_list_mutex);
+ mutex_enter(block_mutex);
+ if (UNIV_UNLIKELY(bpage->space != space
+ || bpage->offset != offset
+ || !bpage->in_LRU_list
+ || !bpage->zip.data)) {
+ /* someone should interrupt, retry */
+ mutex_exit(&LRU_list_mutex);
+ mutex_exit(block_mutex);
+ goto lookup;
+ }
+
+ /* Discard the uncompressed page frame if possible. */
+ if (buf_LRU_free_block(bpage, FALSE, TRUE)) {
+ mutex_exit(&LRU_list_mutex);
mutex_exit(block_mutex);
goto lookup;
}
+ mutex_exit(&LRU_list_mutex);
+
buf_block_buf_fix_inc((buf_block_t*) bpage,
__FILE__, __LINE__);
goto got_block;
@@ -2516,16 +1712,19 @@ buf_block_align(
/* TODO: protect buf_pool->chunks with a mutex (it will
currently remain constant after buf_pool_init()) */
for (chunk = buf_pool->chunks, i = buf_pool->n_chunks; i--; chunk++) {
- lint offs = ptr - chunk->blocks->frame;
+ ulint offs;
- if (UNIV_UNLIKELY(offs < 0)) {
+ if (UNIV_UNLIKELY(ptr < chunk->blocks->frame)) {
continue;
}
+ /* else */
+
+ offs = ptr - chunk->blocks->frame;
offs >>= UNIV_PAGE_SIZE_SHIFT;
- if (UNIV_LIKELY((ulint) offs < chunk->size)) {
+ if (UNIV_LIKELY(offs < chunk->size)) {
buf_block_t* block = &chunk->blocks[offs];
/* The function buf_chunk_init() invokes
@@ -2651,7 +1850,7 @@ buf_page_get_gen(
ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
buf_block_t* guess, /*!< in: guessed block or NULL */
ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
- BUF_GET_NO_LATCH */
+ BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH */
const char* file, /*!< in: file name */
ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mini-transaction */
@@ -2661,7 +1860,7 @@ buf_page_get_gen(
ulint fix_type;
ibool must_read;
ulint retries = 0;
- mutex_t* block_mutex= NULL;
+ mutex_t* block_mutex = NULL;
trx_t* trx = NULL;
ulint sec;
ulint ms;
@@ -2673,9 +1872,19 @@ buf_page_get_gen(
ut_ad((rw_latch == RW_S_LATCH)
|| (rw_latch == RW_X_LATCH)
|| (rw_latch == RW_NO_LATCH));
- ut_ad((mode != BUF_GET_NO_LATCH) || (rw_latch == RW_NO_LATCH));
- ut_ad((mode == BUF_GET) || (mode == BUF_GET_IF_IN_POOL)
- || (mode == BUF_GET_NO_LATCH));
+#ifdef UNIV_DEBUG
+ switch (mode) {
+ case BUF_GET_NO_LATCH:
+ ut_ad(rw_latch == RW_NO_LATCH);
+ break;
+ case BUF_GET:
+ case BUF_GET_IF_IN_POOL:
+ case BUF_PEEK_IF_IN_POOL:
+ break;
+ default:
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
ut_ad(zip_size == fil_space_get_zip_size(space));
ut_ad(ut_is_2pow(zip_size));
#ifndef UNIV_LOG_DEBUG
@@ -2693,13 +1902,8 @@ loop:
block_mutex = buf_page_get_mutex_enter((buf_page_t*)block);
/* If the guess is a compressed page descriptor that
- has been allocated by buf_buddy_alloc(), it may have
- been invalidated by buf_buddy_relocate(). In that
- case, block could point to something that happens to
- contain the expected bits in block->page. Similarly,
- the guess may be pointing to a buffer pool chunk that
- has been released when resizing the buffer pool. */
-
+ has been allocated by buf_page_alloc_descriptor(),
+ it may have been freed by buf_relocate(). */
if (!block_mutex) {
block = guess = NULL;
} else if (!buf_block_is_uncompressed(block)
@@ -2720,6 +1924,27 @@ loop:
rw_lock_s_lock(&page_hash_latch);
block = (buf_block_t*) buf_page_hash_get(space, offset);
if (block) {
+ if (UNIV_UNLIKELY(block->page.space_was_being_deleted)) {
+ /* This page is obsoleted, should discard and retry */
+ rw_lock_s_unlock(&page_hash_latch);
+
+ mutex_enter(&LRU_list_mutex);
+ block_mutex = buf_page_get_mutex_enter((buf_page_t*)block);
+
+ if (UNIV_UNLIKELY(!block_mutex)) {
+ mutex_exit(&LRU_list_mutex);
+ goto loop;
+ }
+
+ buf_LRU_free_block((buf_page_t*)block, TRUE, TRUE);
+
+ mutex_exit(&LRU_list_mutex);
+ mutex_exit(block_mutex);
+ block_mutex = NULL;
+
+ goto loop;
+ }
+
block_mutex = buf_page_get_mutex_enter((buf_page_t*)block);
ut_a(block_mutex);
}
@@ -2732,7 +1957,8 @@ loop2:
//buf_pool_mutex_exit();
- if (mode == BUF_GET_IF_IN_POOL) {
+ if (mode == BUF_GET_IF_IN_POOL
+ || mode == BUF_PEEK_IF_IN_POOL) {
return(NULL);
}
@@ -2771,7 +1997,8 @@ loop2:
must_read = buf_block_get_io_fix(block) == BUF_IO_READ;
- if (must_read && mode == BUF_GET_IF_IN_POOL) {
+ if (must_read && (mode == BUF_GET_IF_IN_POOL
+ || mode == BUF_PEEK_IF_IN_POOL)) {
/* The page is only being read to buffer */
//buf_pool_mutex_exit();
mutex_exit(block_mutex);
@@ -2779,13 +2006,12 @@ loop2:
return(NULL);
}
- if (srv_pass_corrupt_table) {
+ if (srv_pass_corrupt_table <= 1) {
if (block->page.is_corrupt) {
mutex_exit(block_mutex);
return(NULL);
}
}
- ut_a(!(block->page.is_corrupt));
switch (buf_block_get_state(block)) {
buf_page_t* bpage;
@@ -2897,8 +2123,10 @@ wait_until_unfixed:
if (buf_page_get_state(&block->page)
== BUF_BLOCK_ZIP_PAGE) {
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
UT_LIST_REMOVE(zip_list, buf_pool->zip_clean,
&block->page);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
ut_ad(!block->page.in_flush_list);
} else {
/* Relocate buf_pool->flush_list. */
@@ -2931,10 +2159,10 @@ wait_until_unfixed:
buf_pool->n_pend_unzip++;
mutex_exit(&buf_pool_mutex);
- buf_buddy_free(bpage, sizeof *bpage, FALSE);
-
//buf_pool_mutex_exit();
+ buf_page_free_descriptor(bpage);
+
/* Decompress the page and apply buffered operations
while not holding buf_pool_mutex or block->mutex. */
success = buf_zip_decompress(block, srv_use_checksums);
@@ -2981,9 +2209,9 @@ wait_until_unfixed:
/* Try to evict the block from the buffer pool, to use the
insert buffer as much as possible. */
- if (buf_LRU_free_block(&block->page, TRUE, FALSE) == BUF_LRU_FREED) {
- buf_pool_mutex_exit();
- mutex_exit(&block->mutex);
+ if (buf_LRU_free_block(&block->page, TRUE, FALSE)) {
+ //buf_pool_mutex_exit();
+ mutex_exit(block_mutex);
fprintf(stderr,
"innodb_change_buffering_debug evict %u %u\n",
(unsigned) space, (unsigned) offset);
@@ -3011,7 +2239,9 @@ wait_until_unfixed:
//buf_pool_mutex_exit();
mutex_exit(block_mutex);
- buf_page_set_accessed_make_young(&block->page, access_time);
+ if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL)) {
+ buf_page_set_accessed_make_young(&block->page, access_time);
+ }
#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
ut_a(!block->page.file_page_was_freed);
@@ -3077,7 +2307,7 @@ wait_until_unfixed:
mtr_memo_push(mtr, block, fix_type);
- if (!access_time) {
+ if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL) && !access_time) {
/* In the case of a first access, try to apply linear
read-ahead */
@@ -3531,6 +2761,7 @@ buf_page_init_for_read(
{
buf_block_t* block;
buf_page_t* bpage;
+ buf_page_t* bpage_in_bp;
mtr_t mtr;
ibool lru = FALSE;
void* data;
@@ -3566,11 +2797,29 @@ buf_page_init_for_read(
ut_ad(block);
}
+retry:
//buf_pool_mutex_enter();
mutex_enter(&LRU_list_mutex);
rw_lock_x_lock(&page_hash_latch);
- if (buf_page_hash_get(space, offset)) {
+ bpage_in_bp = buf_page_hash_get(space, offset);
+
+ if (UNIV_UNLIKELY(bpage_in_bp && bpage_in_bp->space_was_being_deleted)) {
+ mutex_t* block_mutex = buf_page_get_mutex_enter(bpage_in_bp);
+
+ /* This page is obsoleted, should discard and retry */
+ rw_lock_x_unlock(&page_hash_latch);
+ ut_a(block_mutex);
+
+ buf_LRU_free_block(bpage_in_bp, TRUE, TRUE);
+
+ mutex_exit(&LRU_list_mutex);
+ mutex_exit(block_mutex);
+
+ goto retry;
+ }
+
+ if (bpage_in_bp) {
/* The page is already in the buffer pool. */
err_exit:
if (block) {
@@ -3648,17 +2897,12 @@ err_exit:
mutex_exit(&LRU_list_mutex);
mutex_exit(&block->mutex);
} else {
- /* Defer buf_buddy_alloc() until after the block has
- been found not to exist. The buf_buddy_alloc() and
- buf_buddy_free() calls may be expensive because of
- buf_buddy_relocate(). */
/* The compressed page must be allocated before the
control block (bpage), in order to avoid the
invocation of buf_buddy_relocate_block() on
uninitialized data. */
data = buf_buddy_alloc(zip_size, &lru, TRUE);
- bpage = buf_buddy_alloc(sizeof *bpage, &lru, TRUE);
/* If buf_buddy_alloc() allocated storage from the LRU list,
it released and reacquired buf_pool_mutex. Thus, we must
@@ -3666,17 +2910,16 @@ err_exit:
if (UNIV_UNLIKELY(lru)
&& UNIV_LIKELY_NULL(buf_page_hash_get(space, offset))) {
- /* The block was added by some other thread. */
- buf_buddy_free(bpage, sizeof *bpage, TRUE);
buf_buddy_free(data, zip_size, TRUE);
mutex_exit(&LRU_list_mutex);
rw_lock_x_unlock(&page_hash_latch);
-
bpage = NULL;
goto func_exit;
}
+ bpage = buf_page_alloc_descriptor();
+
page_zip_des_init(&bpage->zip);
page_zip_set_size(&bpage->zip, zip_size);
bpage->zip.data = data;
@@ -3706,9 +2949,11 @@ err_exit:
/* The block must be put to the LRU list, to the old blocks */
buf_LRU_add_block(bpage, TRUE/* to old blocks */);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
mutex_enter(&flush_list_mutex);
buf_LRU_insert_zip_clean(bpage);
mutex_exit(&flush_list_mutex);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
mutex_exit(&LRU_list_mutex);
@@ -3759,12 +3004,28 @@ buf_page_create(
free_block = buf_LRU_get_free_block();
+retry:
//buf_pool_mutex_enter();
mutex_enter(&LRU_list_mutex);
rw_lock_x_lock(&page_hash_latch);
block = (buf_block_t*) buf_page_hash_get(space, offset);
+ if (UNIV_UNLIKELY(block && block->page.space_was_being_deleted)) {
+ mutex_t* block_mutex = buf_page_get_mutex_enter((buf_page_t*)block);
+
+ /* This page is obsoleted, should discard and retry */
+ rw_lock_x_unlock(&page_hash_latch);
+ ut_a(block_mutex);
+
+ buf_LRU_free_block((buf_page_t*)block, TRUE, TRUE);
+
+ mutex_exit(&LRU_list_mutex);
+ mutex_exit(block_mutex);
+
+ goto retry;
+ }
+
if (block && buf_page_in_file(&block->page)) {
#ifdef UNIV_IBUF_COUNT_DEBUG
ut_a(ibuf_count_get(space, offset) == 0);
@@ -3889,8 +3150,7 @@ UNIV_INTERN
void
buf_page_io_complete(
/*=================*/
- buf_page_t* bpage, /*!< in: pointer to the block in question */
- trx_t* trx)
+ buf_page_t* bpage) /*!< in: pointer to the block in question */
{
enum buf_io_fix io_type;
const ibool uncompressed = (buf_page_get_state(bpage)
@@ -4011,14 +3271,18 @@ corrupt:
if (srv_pass_corrupt_table && !trx_sys_sys_space(bpage->space)
&& bpage->space < SRV_LOG_SPACE_FIRST_ID) {
+ trx_t* trx;
+
fprintf(stderr,
"InnoDB: space %u will be treated as corrupt.\n",
bpage->space);
fil_space_set_corrupt(bpage->space);
- if (trx && trx->dict_operation_lock_mode == 0) {
- dict_table_set_corrupt_by_space(bpage->space, TRUE);
- } else {
+
+ trx = innobase_get_trx();
+ if (trx && trx->dict_operation_lock_mode == RW_X_LATCH) {
dict_table_set_corrupt_by_space(bpage->space, FALSE);
+ } else {
+ dict_table_set_corrupt_by_space(bpage->space, TRUE);
}
bpage->is_corrupt = TRUE;
} else
@@ -4781,12 +4045,16 @@ buf_print_io(
/* Statistics about read ahead algorithm */
fprintf(file, "Pages read ahead %.2f/s,"
- " evicted without access %.2f/s\n",
+ " evicted without access %.2f/s,"
+ " Random read ahead %.2f/s\n",
(buf_pool->stat.n_ra_pages_read
- buf_pool->old_stat.n_ra_pages_read)
/ time_elapsed,
(buf_pool->stat.n_ra_pages_evicted
- buf_pool->old_stat.n_ra_pages_evicted)
+ / time_elapsed,
+ (buf_pool->stat.n_ra_pages_read_rnd
+ - buf_pool->old_stat.n_ra_pages_read_rnd)
/ time_elapsed);
/* Print some values to help us with visualizing what is
diff --git a/storage/xtradb/buf/buf0flu.c b/storage/xtradb/buf/buf0flu.c
index cda8d3b170e..0ea3ed29d2b 100644
--- a/storage/xtradb/buf/buf0flu.c
+++ b/storage/xtradb/buf/buf0flu.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -459,7 +459,9 @@ buf_flush_remove(
case BUF_BLOCK_ZIP_DIRTY:
buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
buf_LRU_insert_zip_clean(bpage);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
break;
case BUF_BLOCK_FILE_PAGE:
UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
@@ -769,7 +771,7 @@ corrupted_page:
flush:
/* Now flush the doublewrite buffer data to disk */
- fil_flush(srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE);
+ fil_flush(srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE, FALSE);
/* We know that the writes have been flushed to disk now
and in recovery we will find them in the doublewrite buffer
@@ -1084,7 +1086,7 @@ buf_flush_page_try(
/*===============*/
buf_block_t* block) /*!< in/out: buffer control block */
{
- ut_ad(buf_pool_mutex_own());
+ //ut_ad(buf_pool_mutex_own());
ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
ut_ad(mutex_own(&block->mutex));
@@ -1092,8 +1094,11 @@ buf_flush_page_try(
return(FALSE);
}
+ buf_pool_mutex_enter();
+
if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
|| buf_pool->init_flush[BUF_FLUSH_LRU]) {
+ buf_pool_mutex_exit();
/* There is already a flush batch of the same type running */
return(FALSE);
}
diff --git a/storage/xtradb/buf/buf0lru.c b/storage/xtradb/buf/buf0lru.c
index 583eec9bd9c..df74ccf500f 100644
--- a/storage/xtradb/buf/buf0lru.c
+++ b/storage/xtradb/buf/buf0lru.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -259,18 +259,20 @@ buf_LRU_drop_page_hash_for_tablespace(
* BUF_LRU_DROP_SEARCH_HASH_SIZE);
//buf_pool_mutex_enter();
mutex_enter(&LRU_list_mutex);
+ num_entries = 0;
scan_again:
- num_entries = 0;
bpage = UT_LIST_GET_LAST(buf_pool->LRU);
while (bpage != NULL) {
+ /* bpage->state,space,io_fix,buf_fix_count are protected by block_mutex at XtraDB */
mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);
buf_page_t* prev_bpage;
+ ibool is_fixed;
prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
- if (!block_mutex) {
+ if (UNIV_UNLIKELY(!block_mutex)) {
goto next_page;
}
@@ -278,59 +280,77 @@ scan_again:
if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE
|| bpage->space != id
- || bpage->buf_fix_count > 0
|| bpage->io_fix != BUF_IO_NONE) {
- /* We leave the fixed pages as is in this scan.
- To be dealt with later in the final scan. */
+ /* Compressed pages are never hashed.
+ Skip blocks of other tablespaces.
+ Skip I/O-fixed blocks (to be dealt with later). */
mutex_exit(block_mutex);
- goto next_page;
+next_page:
+ bpage = prev_bpage;
+ continue;
}
- if (((buf_block_t*) bpage)->is_hashed) {
+ //mutex_enter(&((buf_block_t*) bpage)->mutex);
+ is_fixed = bpage->buf_fix_count > 0
+ || !((buf_block_t*) bpage)->is_hashed;
+ //mutex_exit(&((buf_block_t*) bpage)->mutex);
- /* Store the offset(i.e.: page_no) in the array
- so that we can drop hash index in a batch
- later. */
- page_arr[num_entries] = bpage->offset;
+ if (is_fixed) {
mutex_exit(block_mutex);
- ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE);
- ++num_entries;
+ goto next_page;
+ }
- if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) {
- goto next_page;
- }
- /* Array full. We release the buf_pool_mutex to
- obey the latching order. */
- //buf_pool_mutex_exit();
- mutex_exit(&LRU_list_mutex);
+ /* Store the page number so that we can drop the hash
+ index in a batch later. */
+ page_arr[num_entries] = bpage->offset;
+ mutex_exit(block_mutex);
- buf_LRU_drop_page_hash_batch(id, zip_size, page_arr,
- num_entries);
- num_entries = 0;
- //buf_pool_mutex_enter();
- mutex_enter(&LRU_list_mutex);
- } else {
- mutex_exit(block_mutex);
+ ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE);
+ ++num_entries;
+
+ if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) {
+ goto next_page;
}
-next_page:
- /* Note that we may have released the buf_pool mutex
- above after reading the prev_bpage during processing
- of a page_hash_batch (i.e.: when the array was full).
- This means that prev_bpage can change in LRU list.
- This is OK because this function is a 'best effort'
- to drop as many search hash entries as possible and
- it does not guarantee that ALL such entries will be
- dropped. */
- bpage = prev_bpage;
+ /* Array full. We release the buf_pool_mutex to
+ obey the latching order. */
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ buf_LRU_drop_page_hash_batch(id, zip_size, page_arr,
+ num_entries);
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+ num_entries = 0;
+
+ /* Note that we released the buf_pool mutex above
+ after reading the prev_bpage during processing of a
+ page_hash_batch (i.e.: when the array was full).
+ Because prev_bpage could belong to a compressed-only
+ block, it may have been relocated, and thus the
+ pointer cannot be trusted. Because bpage is of type
+ buf_block_t, it is safe to dereference.
+
+ bpage can change in the LRU list. This is OK because
+ this function is a 'best effort' to drop as many
+ search hash entries as possible and it does not
+ guarantee that ALL such entries will be dropped. */
/* If, however, bpage has been removed from LRU list
to the free list then we should restart the scan.
bpage->state is protected by buf_pool mutex. */
- if (bpage && !buf_page_in_file(bpage)) {
- ut_a(num_entries == 0);
+
+ /* obtain block_mutex again to avoid race condition of bpage->state */
+ block_mutex = buf_page_get_mutex_enter(bpage);
+ if (!block_mutex) {
goto scan_again;
}
+
+ if (bpage
+ && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
+ mutex_exit(block_mutex);
+ goto scan_again;
+ }
+ mutex_exit(block_mutex);
}
//buf_pool_mutex_exit();
@@ -372,7 +392,7 @@ scan_again:
while (bpage != NULL) {
buf_page_t* prev_bpage;
- ibool prev_bpage_buf_fix = FALSE;
+ mutex_t* block_mutex = NULL;
ut_a(buf_page_in_file(bpage));
@@ -385,26 +405,28 @@ scan_again:
if (buf_page_get_space(bpage) != id) {
/* Skip this block, as it does not belong to
the space that is being invalidated. */
+ goto next_page;
} else if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
/* We cannot remove this page during this scan
yet; maybe the system is currently reading it
in, or flushing the modifications to the file */
all_freed = FALSE;
+ goto next_page;
} else {
- mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);
+ block_mutex = buf_page_get_mutex_enter(bpage);
if (!block_mutex) {
/* It may be impossible case...
Something wrong, so will be scan_again */
all_freed = FALSE;
-
- goto next_page_no_mutex;
+ goto next_page;
}
if (bpage->buf_fix_count > 0) {
+ mutex_exit(block_mutex);
/* We cannot remove this page during
this scan yet; maybe the system is
currently reading it in, or flushing
@@ -414,108 +436,61 @@ scan_again:
goto next_page;
}
+ }
+
+ ut_ad(mutex_own(block_mutex));
#ifdef UNIV_DEBUG
- if (buf_debug_prints) {
- fprintf(stderr,
- "Dropping space %lu page %lu\n",
- (ulong) buf_page_get_space(bpage),
- (ulong) buf_page_get_page_no(bpage));
- }
+ if (buf_debug_prints) {
+ fprintf(stderr,
+ "Dropping space %lu page %lu\n",
+ (ulong) buf_page_get_space(bpage),
+ (ulong) buf_page_get_page_no(bpage));
+ }
#endif
- if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
- /* This is a compressed-only block
- descriptor. Ensure that prev_bpage
- cannot be relocated when bpage is freed. */
- if (UNIV_LIKELY(prev_bpage != NULL)) {
- switch (buf_page_get_state(
- prev_bpage)) {
- case BUF_BLOCK_FILE_PAGE:
- /* Descriptors of uncompressed
- blocks will not be relocated,
- because we are holding the
- buf_pool_mutex. */
- break;
- case BUF_BLOCK_ZIP_PAGE:
- case BUF_BLOCK_ZIP_DIRTY:
- /* Descriptors of compressed-
- only blocks can be relocated,
- unless they are buffer-fixed.
- Because both bpage and
- prev_bpage are protected by
- buf_pool_zip_mutex, it is
- not necessary to acquire
- further mutexes. */
- ut_ad(&buf_pool_zip_mutex
- == block_mutex);
- ut_ad(mutex_own(block_mutex));
- prev_bpage_buf_fix = TRUE;
- prev_bpage->buf_fix_count++;
- break;
- default:
- ut_error;
- }
- }
- } else if (((buf_block_t*) bpage)->is_hashed) {
- ulint page_no;
- ulint zip_size;
-
- //buf_pool_mutex_exit();
- mutex_exit(&LRU_list_mutex);
- rw_lock_x_unlock(&page_hash_latch);
+ if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
+ /* This is a compressed-only block
+ descriptor. Do nothing. */
+ } else if (((buf_block_t*) bpage)->is_hashed) {
+ ulint page_no;
+ ulint zip_size;
- zip_size = buf_page_get_zip_size(bpage);
- page_no = buf_page_get_page_no(bpage);
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
- mutex_exit(block_mutex);
+ zip_size = buf_page_get_zip_size(bpage);
+ page_no = buf_page_get_page_no(bpage);
- /* Note that the following call will acquire
- an S-latch on the page */
+ mutex_exit(block_mutex);
- btr_search_drop_page_hash_when_freed(
- id, zip_size, page_no);
- goto scan_again;
- }
+ /* Note that the following call will acquire
+ an S-latch on the page */
- if (bpage->oldest_modification != 0) {
+ btr_search_drop_page_hash_when_freed(
+ id, zip_size, page_no);
+ goto scan_again;
+ }
- buf_flush_remove(bpage);
- }
+ if (bpage->oldest_modification != 0) {
- /* Remove from the LRU list. */
+ buf_flush_remove(bpage);
+ }
- if (buf_LRU_block_remove_hashed_page(bpage, TRUE)
- != BUF_BLOCK_ZIP_FREE) {
- buf_LRU_block_free_hashed_page((buf_block_t*)
- bpage, TRUE);
- } else {
- /* The block_mutex should have been
- released by buf_LRU_block_remove_hashed_page()
- when it returns BUF_BLOCK_ZIP_FREE. */
- ut_ad(block_mutex == &buf_pool_zip_mutex);
- ut_ad(!mutex_own(block_mutex));
-
- if (prev_bpage_buf_fix) {
- /* We temporarily buffer-fixed
- prev_bpage, so that
- buf_buddy_free() could not
- relocate it, in case it was a
- compressed-only block
- descriptor. */
-
- mutex_enter(block_mutex);
- ut_ad(prev_bpage->buf_fix_count > 0);
- prev_bpage->buf_fix_count--;
- mutex_exit(block_mutex);
- }
+ /* Remove from the LRU list. */
- goto next_page_no_mutex;
- }
-next_page:
+ if (buf_LRU_block_remove_hashed_page(bpage, TRUE)
+ != BUF_BLOCK_ZIP_FREE) {
+ buf_LRU_block_free_hashed_page((buf_block_t*) bpage, TRUE);
mutex_exit(block_mutex);
+ } else {
+ /* The block_mutex should have been released
+ by buf_LRU_block_remove_hashed_page() when it
+ returns BUF_BLOCK_ZIP_FREE. */
+ ut_ad(block_mutex == &buf_pool_zip_mutex);
+ ut_ad(!mutex_own(block_mutex));
}
-
-next_page_no_mutex:
+next_page:
bpage = prev_bpage;
}
@@ -539,6 +514,8 @@ buf_LRU_mark_space_was_deleted(
ulint id) /*!< in: space id */
{
buf_page_t* bpage;
+ buf_chunk_t* chunk;
+ ulint i, j;
mutex_enter(&LRU_list_mutex);
@@ -552,8 +529,32 @@ buf_LRU_mark_space_was_deleted(
}
mutex_exit(&LRU_list_mutex);
+
+ rw_lock_s_lock(&btr_search_latch);
+ chunk = buf_pool->chunks;
+ for (i = buf_pool->n_chunks; i--; chunk++) {
+ buf_block_t* block = chunk->blocks;
+ for (j = chunk->size; j--; block++) {
+ if (buf_block_get_state(block)
+ != BUF_BLOCK_FILE_PAGE
+ || !block->is_hashed
+ || buf_page_get_space(&block->page) != id) {
+ continue;
+ }
+
+ rw_lock_s_unlock(&btr_search_latch);
+
+ rw_lock_x_lock(&block->lock);
+ btr_search_drop_page_hash_index(block);
+ rw_lock_x_unlock(&block->lock);
+
+ rw_lock_s_lock(&btr_search_latch);
+ }
+ }
+ rw_lock_s_unlock(&btr_search_latch);
}
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/********************************************************************//**
Insert a compressed block into buf_pool->zip_clean in the LRU order. */
UNIV_INTERN
@@ -587,6 +588,7 @@ buf_LRU_insert_zip_clean(
UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_clean, bpage);
}
}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
/******************************************************************//**
Try to free an uncompressed page of a compressed block from the unzip
@@ -629,7 +631,7 @@ restart:
UNIV_LIKELY(block != NULL) && UNIV_LIKELY(distance > 0);
block = UT_LIST_GET_PREV(unzip_LRU, block), distance--) {
- enum buf_lru_free_block_status freed;
+ ibool freed;
mutex_enter(&block->mutex);
if (!block->in_unzip_LRU_list || !block->page.in_LRU_list
@@ -645,24 +647,9 @@ restart:
freed = buf_LRU_free_block(&block->page, FALSE, have_LRU_mutex);
mutex_exit(&block->mutex);
- switch (freed) {
- case BUF_LRU_FREED:
+ if (freed) {
return(TRUE);
-
- case BUF_LRU_CANNOT_RELOCATE:
- /* If we failed to relocate, try
- regular LRU eviction. */
- return(FALSE);
-
- case BUF_LRU_NOT_FREED:
- /* The block was buffer-fixed or I/O-fixed.
- Keep looking. */
- continue;
}
-
- /* inappropriate return value from
- buf_LRU_free_block() */
- ut_error;
}
return(FALSE);
@@ -695,10 +682,9 @@ restart:
UNIV_LIKELY(bpage != NULL) && UNIV_LIKELY(distance > 0);
bpage = UT_LIST_GET_PREV(LRU, bpage), distance--) {
- enum buf_lru_free_block_status freed;
- unsigned accessed;
- mutex_t* block_mutex
- = buf_page_get_mutex_enter(bpage);
+ ibool freed;
+ unsigned accessed;
+ mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);
if (!block_mutex) {
goto restart;
@@ -717,8 +703,7 @@ restart:
freed = buf_LRU_free_block(bpage, TRUE, have_LRU_mutex);
mutex_exit(block_mutex);
- switch (freed) {
- case BUF_LRU_FREED:
+ if (freed) {
/* Keep track of pages that are evicted without
ever being accessed. This gives us a measure of
the effectiveness of readahead */
@@ -726,21 +711,7 @@ restart:
++buf_pool->stat.n_ra_pages_evicted;
}
return(TRUE);
-
- case BUF_LRU_NOT_FREED:
- /* The block was dirty, buffer-fixed, or I/O-fixed.
- Keep looking. */
- continue;
-
- case BUF_LRU_CANNOT_RELOCATE:
- /* This should never occur, because we
- want to discard the compressed page too. */
- break;
}
-
- /* inappropriate return value from
- buf_LRU_free_block() */
- ut_error;
}
return(FALSE);
@@ -1457,17 +1428,16 @@ buf_LRU_make_block_old(
Try to free a block. If bpage is a descriptor of a compressed-only
page, the descriptor object will be freed as well.
-NOTE: If this function returns BUF_LRU_FREED, it will temporarily
+NOTE: If this function returns TRUE, it will temporarily
release buf_pool_mutex. Furthermore, the page frame will no longer be
accessible via bpage.
The caller must hold buf_pool_mutex and buf_page_get_mutex(bpage) and
release these two mutexes after the call. No other
buf_page_get_mutex() may be held when calling this function.
-@return BUF_LRU_FREED if freed, BUF_LRU_CANNOT_RELOCATE or
-BUF_LRU_NOT_FREED otherwise. */
+@return TRUE if freed, FALSE otherwise. */
UNIV_INTERN
-enum buf_lru_free_block_status
+ibool
buf_LRU_free_block(
/*===============*/
buf_page_t* bpage, /*!< in: block to be freed */
@@ -1493,7 +1463,7 @@ buf_LRU_free_block(
if (!bpage->in_LRU_list || !block_mutex || !buf_page_can_relocate(bpage)) {
/* Do not free buffer-fixed or I/O-fixed blocks. */
- return(BUF_LRU_NOT_FREED);
+ return(FALSE);
}
if (bpage->space_was_being_deleted && bpage->oldest_modification != 0) {
@@ -1509,7 +1479,7 @@ buf_LRU_free_block(
/* Do not completely free dirty blocks. */
if (bpage->oldest_modification) {
- return(BUF_LRU_NOT_FREED);
+ return(FALSE);
}
} else if (bpage->oldest_modification) {
/* Do not completely free dirty blocks. */
@@ -1517,7 +1487,7 @@ buf_LRU_free_block(
if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
ut_ad(buf_page_get_state(bpage)
== BUF_BLOCK_ZIP_DIRTY);
- return(BUF_LRU_NOT_FREED);
+ return(FALSE);
}
goto alloc;
@@ -1526,14 +1496,8 @@ buf_LRU_free_block(
If it cannot be allocated (without freeing a block
from the LRU list), refuse to free bpage. */
alloc:
- //buf_pool_mutex_exit_forbid();
- b = buf_buddy_alloc(sizeof *b, NULL, FALSE);
- //buf_pool_mutex_exit_allow();
-
- if (UNIV_UNLIKELY(!b)) {
- return(BUF_LRU_CANNOT_RELOCATE);
- }
-
+ b = buf_page_alloc_descriptor();
+ ut_a(b);
//memcpy(b, bpage, sizeof *b);
}
@@ -1563,7 +1527,7 @@ not_freed:
if (!have_LRU_mutex)
mutex_exit(&LRU_list_mutex);
rw_lock_x_unlock(&page_hash_latch);
- return(BUF_LRU_NOT_FREED);
+ return(FALSE);
} else if (zip || !bpage->zip.data) {
if (bpage->oldest_modification)
goto not_freed;
@@ -1670,7 +1634,9 @@ not_freed:
mutex_enter(&flush_list_mutex);
if (b->state == BUF_BLOCK_ZIP_PAGE) {
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
buf_LRU_insert_zip_clean(b);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
} else {
/* Relocate on buf_pool->flush_list. */
buf_flush_relocate_on_flush_list(bpage, b);
@@ -1746,7 +1712,7 @@ not_freed:
rw_lock_x_unlock(&page_hash_latch);
}
- return(BUF_LRU_FREED);
+ return(TRUE);
}
/******************************************************************//**
@@ -1967,15 +1933,16 @@ buf_LRU_block_remove_hashed_page(
ut_a(bpage->zip.data);
ut_a(buf_page_get_zip_size(bpage));
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, bpage);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
mutex_exit(&buf_pool_zip_mutex);
//buf_pool_mutex_exit_forbid();
buf_buddy_free(bpage->zip.data,
page_zip_get_size(&bpage->zip), TRUE);
- buf_buddy_free(bpage, sizeof(*bpage), TRUE);
//buf_pool_mutex_exit_allow();
- UNIV_MEM_UNDESC(bpage);
+ buf_page_free_descriptor(bpage);
return(BUF_BLOCK_ZIP_FREE);
case BUF_BLOCK_FILE_PAGE:
@@ -2284,6 +2251,11 @@ buf_LRU_file_restore(void)
" InnoDB: cannot open %s\n", LRU_DUMP_FILE);
goto end;
}
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Restoring buffer pool pages from %s\n",
+ LRU_DUMP_FILE);
+
if (size == 0 || size_high > 0 || size % 8) {
fprintf(stderr, " InnoDB: broken LRU dump file\n");
goto end;
@@ -2385,7 +2357,7 @@ buf_LRU_file_restore(void)
ut_print_timestamp(stderr);
fprintf(stderr,
- " InnoDB: reading pages based on the dumped LRU list was done."
+ " InnoDB: Completed reading buffer pool pages"
" (requested: %lu, read: %lu)\n", req, reads);
ret = TRUE;
end:
diff --git a/storage/xtradb/buf/buf0rea.c b/storage/xtradb/buf/buf0rea.c
index 59de70d9a8a..966a9b5d7a6 100644
--- a/storage/xtradb/buf/buf0rea.c
+++ b/storage/xtradb/buf/buf0rea.c
@@ -38,6 +38,14 @@ Created 11/5/1995 Heikki Tuuri
#include "srv0start.h"
#include "srv0srv.h"
+/** The size in blocks of the area where the random read-ahead algorithm counts
+the accessed pages when deciding whether to read-ahead */
+#define BUF_READ_AHEAD_RANDOM_AREA BUF_READ_AHEAD_AREA
+
+/** There must be at least this many pages in buf_pool in the area to start
+a random read-ahead */
+#define BUF_READ_AHEAD_RANDOM_THRESHOLD (5 + BUF_READ_AHEAD_RANDOM_AREA / 8)
+
/** The linear read-ahead area size */
#define BUF_READ_AHEAD_LINEAR_AREA BUF_READ_AHEAD_AREA
@@ -201,13 +209,179 @@ not_to_recover:
if (sync) {
/* The i/o is already completed when we arrive from
fil_read */
- buf_page_io_complete(bpage, trx);
+ buf_page_io_complete(bpage);
}
return(1);
}
/********************************************************************//**
+Applies a random read-ahead in buf_pool if there are at least a threshold
+value of accessed pages from the random read-ahead area. Does not read any
+page, not even the one at the position (space, offset), if the read-ahead
+mechanism is not activated. NOTE 1: the calling thread may own latches on
+pages: to avoid deadlocks this function must be written such that it cannot
+end up waiting for these latches! NOTE 2: the calling thread must want
+access to the page given: this rule is set to prevent unintended read-aheads
+performed by ibuf routines, a situation which could result in a deadlock if
+the OS does not support asynchronous i/o.
+@return number of page read requests issued; NOTE that if we read ibuf
+pages, it may happen that the page at the given page number does not
+get read even if we return a positive value! */
+static
+ulint
+buf_read_ahead_random(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint offset, /*!< in: page number of a page which the current thread
+ wants to access */
+ trx_t* trx)
+{
+ ib_int64_t tablespace_version;
+ ulint recent_blocks = 0;
+ ulint count;
+ ulint ibuf_mode;
+ ulint low, high;
+ ulint err;
+ ulint i;
+ ulint buf_read_ahead_random_area;
+
+ if (!srv_random_read_ahead) {
+ /* Disabled by user */
+ return(0);
+ }
+
+ if (srv_startup_is_before_trx_rollback_phase) {
+ /* No read-ahead to avoid thread deadlocks */
+ return(0);
+ }
+
+ if (ibuf_bitmap_page(zip_size, offset)
+ || trx_sys_hdr_page(space, offset)) {
+
+ /* If it is an ibuf bitmap page or trx sys hdr, we do
+ no read-ahead, as that could break the ibuf page access
+ order */
+
+ return(0);
+ }
+
+ /* Remember the tablespace version before we ask the tablespace size
+ below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
+ do not try to read outside the bounds of the tablespace! */
+
+ tablespace_version = fil_space_get_version(space);
+
+ buf_read_ahead_random_area = BUF_READ_AHEAD_RANDOM_AREA;
+
+ low = (offset / buf_read_ahead_random_area)
+ * buf_read_ahead_random_area;
+ high = (offset / buf_read_ahead_random_area + 1)
+ * buf_read_ahead_random_area;
+ if (high > fil_space_get_size(space)) {
+
+ high = fil_space_get_size(space);
+ }
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&buf_pool_mutex);
+
+ if (buf_pool->n_pend_reads
+ > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
+ //buf_pool_mutex_exit();
+ mutex_exit(&buf_pool_mutex);
+
+ return(0);
+ }
+ mutex_exit(&buf_pool_mutex);
+
+ /* Count how many blocks in the area have been recently accessed,
+ that is, reside near the start of the LRU list. */
+
+ rw_lock_s_lock(&page_hash_latch);
+ for (i = low; i < high; i++) {
+ const buf_page_t* bpage = buf_page_hash_get(space, i);
+
+ if (bpage
+ && buf_page_is_accessed(bpage)
+ && buf_page_peek_if_young(bpage)) {
+
+ recent_blocks++;
+
+ if (recent_blocks >= BUF_READ_AHEAD_RANDOM_THRESHOLD) {
+
+ //buf_pool_mutex_exit();
+ rw_lock_s_unlock(&page_hash_latch);
+ goto read_ahead;
+ }
+ }
+ }
+
+ //buf_pool_mutex_exit();
+ rw_lock_s_unlock(&page_hash_latch);
+ /* Do nothing */
+ return(0);
+
+read_ahead:
+ /* Read all the suitable blocks within the area */
+
+ if (ibuf_inside()) {
+ ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
+ } else {
+ ibuf_mode = BUF_READ_ANY_PAGE;
+ }
+
+ count = 0;
+
+ for (i = low; i < high; i++) {
+ /* It is only sensible to do read-ahead in the non-sync aio
+ mode: hence FALSE as the first parameter */
+
+ if (!ibuf_bitmap_page(zip_size, i)) {
+ count += buf_read_page_low(
+ &err, FALSE,
+ ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
+ space, zip_size, FALSE,
+ tablespace_version, i, trx);
+ if (err == DB_TABLESPACE_DELETED) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: in random"
+ " readahead trying to access\n"
+ "InnoDB: tablespace %lu page %lu,\n"
+ "InnoDB: but the tablespace does not"
+ " exist or is just being dropped.\n",
+ (ulong) space, (ulong) i);
+ }
+ }
+ }
+
+ /* In simulated aio we wake the aio handler threads only after
+ queuing all aio requests, in native aio the following call does
+ nothing: */
+
+ os_aio_simulated_wake_handler_threads();
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints && (count > 0)) {
+ fprintf(stderr,
+ "Random read-ahead space %lu offset %lu pages %lu\n",
+ (ulong) space, (ulong) offset,
+ (ulong) count);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Read ahead is considered one I/O operation for the purpose of
+ LRU policy decision. */
+ buf_LRU_stat_inc_io();
+
+ buf_pool->stat.n_ra_pages_read_rnd += count;
+ return(count);
+}
+
+
+/********************************************************************//**
High-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
@@ -226,6 +400,9 @@ buf_read_page(
ulint count;
ulint err;
+ count = buf_read_ahead_random(space, zip_size, offset, trx);
+ srv_buf_pool_reads += count;
+
tablespace_version = fil_space_get_version(space);
/* We do the i/o in the synchronous aio mode to save thread
diff --git a/storage/xtradb/dict/dict0dict.c b/storage/xtradb/dict/dict0dict.c
index 8d4bd76c32c..6b7e0bcffd2 100644
--- a/storage/xtradb/dict/dict0dict.c
+++ b/storage/xtradb/dict/dict0dict.c
@@ -4527,6 +4527,8 @@ dict_store_statistics(
break;
}
+ btr_pcur_store_position(&pcur, &mtr);
+
if (rec_get_deleted_flag(rec, 0)) {
/* don't count */
i--;
@@ -4567,6 +4569,10 @@ dict_store_statistics(
rests--;
next_rec:
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_MODIFY_LEAF, &pcur, &mtr);
+
btr_pcur_move_to_next_user_rec(&pcur, &mtr);
}
btr_pcur_close(&pcur);
@@ -4657,6 +4663,7 @@ dict_update_statistics(
do {
if (table->is_corrupt) {
ut_a(srv_pass_corrupt_table);
+ dict_table_stats_unlock(table, RW_X_LATCH);
return;
}
diff --git a/storage/xtradb/dict/dict0load.c b/storage/xtradb/dict/dict0load.c
index edd77e2530f..64d7fad3557 100644
--- a/storage/xtradb/dict/dict0load.c
+++ b/storage/xtradb/dict/dict0load.c
@@ -554,9 +554,10 @@ dict_load_columns(
}
/********************************************************************//**
-Loads definitions for index fields. */
+Loads definitions for index fields.
+@return DB_SUCCESS if ok, DB_CORRUPTION if failed */
static
-void
+ulint
dict_load_fields(
/*=============*/
dict_index_t* index, /*!< in: index whose fields to load */
@@ -575,6 +576,7 @@ dict_load_fields(
byte* buf;
ulint i;
mtr_t mtr;
+ ulint error = DB_SUCCESS;
ut_ad(mutex_own(&(dict_sys->mutex)));
@@ -641,6 +643,26 @@ dict_load_fields(
field = rec_get_nth_field_old(rec, 4, &len);
+ if (prefix_len >= DICT_MAX_INDEX_COL_LEN) {
+ fprintf(stderr, "InnoDB: Error: load index"
+ " '%s' failed.\n"
+ "InnoDB: index field '%s' has a prefix"
+ " length of %lu bytes,\n"
+ "InnoDB: which exceeds the"
+ " maximum limit of %lu bytes.\n"
+ "InnoDB: Please use server that"
+ " supports long index prefix\n"
+ "InnoDB: or turn on"
+ " innodb_force_recovery to load"
+ " the table\n",
+ index->name, mem_heap_strdupl(
+ heap, (char*) field, len),
+ (ulong) prefix_len,
+ (ulong) (DICT_MAX_INDEX_COL_LEN - 1));
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
dict_mem_index_add_field(index,
mem_heap_strdupl(heap,
(char*) field, len),
@@ -650,8 +672,10 @@ next_rec:
btr_pcur_move_to_next_user_rec(&pcur, &mtr);
}
+func_exit:
btr_pcur_close(&pcur);
mtr_commit(&mtr);
+ return(error);
}
/********************************************************************//**
@@ -802,7 +826,25 @@ dict_load_indexes(
space, type, n_fields);
index->id = id;
- dict_load_fields(index, heap);
+ error = dict_load_fields(index, heap);
+
+ if (error != DB_SUCCESS) {
+ fprintf(stderr, "InnoDB: Error: load index '%s'"
+ " for table '%s' failed\n",
+ index->name, table->name);
+
+ /* If the force recovery flag is set, and
+ if the failed index is not the primary index, we
+ will continue and open other indexes */
+ if (srv_force_recovery
+ && !(index->type & DICT_CLUSTERED)) {
+ error = DB_SUCCESS;
+ goto next_rec;
+ } else {
+ goto func_exit;
+ }
+ }
+
error = dict_index_add_to_cache(table, index, page_no,
FALSE);
/* The data dictionary tables should never contain
@@ -1028,9 +1070,18 @@ err_exit:
} else {
table->fk_max_recusive_level = 0;
}
- } else if (!srv_force_recovery) {
- dict_table_remove_from_cache(table);
- table = NULL;
+ } else {
+ dict_index_t* index;
+
+ /* Make sure that at least the clustered index was loaded.
+ Otherwise refuse to load the table */
+ index = dict_table_get_first_index(table);
+
+ if (!srv_force_recovery || !index
+ || !(index->type & DICT_CLUSTERED)) {
+ dict_table_remove_from_cache(table);
+ table = NULL;
+ }
}
#if 0
if (err != DB_SUCCESS && table != NULL) {
diff --git a/storage/xtradb/dict/dict0mem.c b/storage/xtradb/dict/dict0mem.c
index f2d219bfd4f..c3da053c2de 100644
--- a/storage/xtradb/dict/dict0mem.c
+++ b/storage/xtradb/dict/dict0mem.c
@@ -36,6 +36,9 @@ Created 1/8/1996 Heikki Tuuri
#ifndef UNIV_HOTBACKUP
# include "lock0lock.h"
#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_BLOB_DEBUG
+# include "ut0rbt.h"
+#endif /* UNIV_BLOB_DEBUG */
#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when
creating a table or index object */
@@ -318,6 +321,12 @@ dict_mem_index_free(
{
ut_ad(index);
ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+#ifdef UNIV_BLOB_DEBUG
+ if (index->blobs) {
+ mutex_free(&index->blobs_mutex);
+ rbt_free(index->blobs);
+ }
+#endif /* UNIV_BLOB_DEBUG */
mem_heap_free(index->heap);
}
diff --git a/storage/xtradb/fil/fil0fil.c b/storage/xtradb/fil/fil0fil.c
index 3c535bf7331..17ca4cb1745 100644
--- a/storage/xtradb/fil/fil0fil.c
+++ b/storage/xtradb/fil/fil0fil.c
@@ -46,6 +46,8 @@ Created 10/25/1995 Heikki Tuuri
#include "row0mysql.h"
#include "row0row.h"
#include "que0que.h"
+#include "btr0btr.h"
+#include "btr0sea.h"
#ifndef UNIV_HOTBACKUP
# include "buf0lru.h"
# include "ibuf0ibuf.h"
@@ -817,7 +819,7 @@ fil_node_close_file(
ut_ad(node && system);
ut_ad(mutex_own(&(system->mutex)));
ut_a(node->open);
- ut_a(node->n_pending == 0 || srv_lazy_drop_table);
+ ut_a(node->n_pending == 0 || node->space->is_being_deleted);
ut_a(node->n_pending_flushes == 0);
ut_a(node->modification_counter == node->flush_counter);
@@ -830,7 +832,7 @@ fil_node_close_file(
ut_a(system->n_open > 0);
system->n_open--;
- if (node->space->purpose == FIL_TABLESPACE && !trx_sys_sys_space(node->space->id)) {
+ if (node->n_pending == 0 && node->space->purpose == FIL_TABLESPACE && !trx_sys_sys_space(node->space->id)) {
ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
/* The node is in the LRU list, remove it */
@@ -1029,7 +1031,7 @@ fil_node_free(
ut_ad(node && system && space);
ut_ad(mutex_own(&(system->mutex)));
ut_a(node->magic_n == FIL_NODE_MAGIC_N);
- ut_a(node->n_pending == 0 || srv_lazy_drop_table);
+ ut_a(node->n_pending == 0 || space->is_being_deleted);
if (node->open) {
/* We fool the assertion in fil_node_close_file() to think
@@ -2579,7 +2581,7 @@ retry:
os_thread_sleep(20000);
- fil_flush(id);
+ fil_flush(id, TRUE);
goto retry;
@@ -2792,7 +2794,7 @@ error_exit2:
goto error_exit;
}
- ret = os_file_flush(file);
+ ret = os_file_flush(file, TRUE);
if (!ret) {
fputs("InnoDB: Error: file flush of tablespace ", stderr);
@@ -2977,7 +2979,7 @@ fil_reset_too_high_lsns(
}
}
- success = os_file_flush(file);
+ success = os_file_flush(file, TRUE);
if (!success) {
goto func_exit;
@@ -2999,7 +3001,7 @@ fil_reset_too_high_lsns(
goto func_exit;
}
- success = os_file_flush(file);
+ success = os_file_flush(file, TRUE);
func_exit:
os_file_close(file);
ut_free(buf2);
@@ -3009,6 +3011,97 @@ func_exit:
}
/********************************************************************//**
+Checks if a page is corrupt. (for offline page)
+*/
+static
+ibool
+fil_page_buf_page_is_corrupted_offline(
+/*===================================*/
+ const byte* page, /*!< in: a database page */
+ ulint zip_size) /*!< in: size of compressed page;
+ 0 for uncompressed pages */
+{
+ ulint checksum_field;
+ ulint old_checksum_field;
+
+ if (!zip_size
+ && memcmp(page + FIL_PAGE_LSN + 4,
+ page + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
+ return(TRUE);
+ }
+
+ checksum_field = mach_read_from_4(page
+ + FIL_PAGE_SPACE_OR_CHKSUM);
+
+ if (zip_size) {
+ return(checksum_field != BUF_NO_CHECKSUM_MAGIC
+ && checksum_field
+ != page_zip_calc_checksum(page, zip_size));
+ }
+
+ old_checksum_field = mach_read_from_4(
+ page + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+ if (old_checksum_field != mach_read_from_4(page
+ + FIL_PAGE_LSN)
+ && old_checksum_field != BUF_NO_CHECKSUM_MAGIC
+ && old_checksum_field
+ != buf_calc_page_old_checksum(page)) {
+ return(TRUE);
+ }
+
+ if (!srv_fast_checksum
+ && checksum_field != 0
+ && checksum_field != BUF_NO_CHECKSUM_MAGIC
+ && checksum_field
+ != buf_calc_page_new_checksum(page)) {
+ return(TRUE);
+ }
+
+ if (srv_fast_checksum
+ && checksum_field != 0
+ && checksum_field != BUF_NO_CHECKSUM_MAGIC
+ && checksum_field
+ != buf_calc_page_new_checksum_32(page)
+ && checksum_field
+ != buf_calc_page_new_checksum(page)) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/********************************************************************//**
+*/
+static
+void
+fil_page_buf_page_store_checksum(
+/*=============================*/
+ byte* page,
+ ulint zip_size)
+{
+ if (!zip_size) {
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+ srv_use_checksums
+ ? (!srv_fast_checksum
+ ? buf_calc_page_new_checksum(page)
+ : buf_calc_page_new_checksum_32(page))
+ : BUF_NO_CHECKSUM_MAGIC);
+ mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ srv_use_checksums
+ ? buf_calc_page_old_checksum(page)
+ : BUF_NO_CHECKSUM_MAGIC);
+ } else {
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+ srv_use_checksums
+ ? page_zip_calc_checksum(page, zip_size)
+ : BUF_NO_CHECKSUM_MAGIC);
+ }
+}
+
+/********************************************************************//**
Tries to open a single-table tablespace and optionally checks the space id is
right in it. If does not succeed, prints an error message to the .err log. This
function is used to open a tablespace when we start up mysqld, and also in
@@ -3123,6 +3216,7 @@ fil_open_single_table_tablespace(
fil_system_t* system;
fil_node_t* node = NULL;
fil_space_t* space;
+ ulint zip_size;
buf3 = ut_malloc(2 * UNIV_PAGE_SIZE);
descr_page = ut_align(buf3, UNIV_PAGE_SIZE);
@@ -3140,12 +3234,15 @@ fil_open_single_table_tablespace(
/* store as first descr page */
memcpy(descr_page, page, UNIV_PAGE_SIZE);
+ zip_size = dict_table_flags_to_zip_size(flags);
+ ut_a(zip_size == dict_table_flags_to_zip_size(space_flags));
+
/* get free limit (page number) of the table space */
/* these should be same to the definition in fsp0fsp.c */
#define FSP_HEADER_OFFSET FIL_PAGE_DATA
#define FSP_FREE_LIMIT 12
free_limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + page);
- free_limit_bytes = (ib_int64_t)free_limit * (ib_int64_t)UNIV_PAGE_SIZE;
+ free_limit_bytes = (ib_int64_t)free_limit * (ib_int64_t)(zip_size ? zip_size : UNIV_PAGE_SIZE);
/* overwrite fsp header */
fsp_header_init_fields(page, id, flags);
@@ -3154,16 +3251,9 @@ fil_open_single_table_tablespace(
space_flags = flags;
if (mach_read_ull(page + FIL_PAGE_FILE_FLUSH_LSN) > current_lsn)
mach_write_ull(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn);
- mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
- srv_use_checksums
- ? (!srv_fast_checksum
- ? buf_calc_page_new_checksum(page)
- : buf_calc_page_new_checksum_32(page))
- : BUF_NO_CHECKSUM_MAGIC);
- mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
- srv_use_checksums
- ? buf_calc_page_old_checksum(page)
- : BUF_NO_CHECKSUM_MAGIC);
+
+ fil_page_buf_page_store_checksum(page, zip_size);
+
success = os_file_write(filepath, file, page, 0, 0, UNIV_PAGE_SIZE);
/* get file size */
@@ -3173,7 +3263,7 @@ fil_open_single_table_tablespace(
if (size_bytes < free_limit_bytes) {
free_limit_bytes = size_bytes;
- if (size_bytes >= (ib_int64_t) (FSP_EXTENT_SIZE * UNIV_PAGE_SIZE)) {
+ if (size_bytes >= (ib_int64_t) (FSP_EXTENT_SIZE * (zip_size ? zip_size : UNIV_PAGE_SIZE))) {
fprintf(stderr, "InnoDB: free limit of %s is larger than its real size.\n", filepath);
file_is_corrupt = TRUE;
}
@@ -3237,75 +3327,41 @@ skip_info:
size_bytes = ut_2pow_round(size_bytes, 1024 * 1024);
}
*/
- if (!(flags & DICT_TF_ZSSIZE_MASK)) {
+
+ if (zip_size) {
+ fprintf(stderr, "InnoDB: Warning: importing compressed table is still EXPERIMENTAL, currently.\n");
+ }
+
+ {
mem_heap_t* heap = NULL;
ulint offsets_[REC_OFFS_NORMAL_SIZE];
ulint* offsets = offsets_;
ib_int64_t offset;
- size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
+ size = (ulint) (size_bytes / (zip_size ? zip_size : UNIV_PAGE_SIZE));
/* over write space id of all pages */
rec_offs_init(offsets_);
fprintf(stderr, "InnoDB: Progress in %%:");
- for (offset = 0; offset < free_limit_bytes; offset += UNIV_PAGE_SIZE) {
- ulint checksum_field;
- ulint old_checksum_field;
+ for (offset = 0; offset < free_limit_bytes;
+ offset += zip_size ? zip_size : UNIV_PAGE_SIZE) {
ibool page_is_corrupt;
success = os_file_read(file, page,
(ulint)(offset & 0xFFFFFFFFUL),
- (ulint)(offset >> 32), UNIV_PAGE_SIZE);
+ (ulint)(offset >> 32),
+ zip_size ? zip_size : UNIV_PAGE_SIZE);
page_is_corrupt = FALSE;
/* check consistency */
- if (memcmp(page + FIL_PAGE_LSN + 4,
- page + UNIV_PAGE_SIZE
- - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
-
+ if (fil_page_buf_page_is_corrupted_offline(page, zip_size)) {
page_is_corrupt = TRUE;
}
if (mach_read_from_4(page + FIL_PAGE_OFFSET)
- != offset / UNIV_PAGE_SIZE) {
-
- page_is_corrupt = TRUE;
- }
-
- checksum_field = mach_read_from_4(page
- + FIL_PAGE_SPACE_OR_CHKSUM);
-
- old_checksum_field = mach_read_from_4(
- page + UNIV_PAGE_SIZE
- - FIL_PAGE_END_LSN_OLD_CHKSUM);
-
- if (old_checksum_field != mach_read_from_4(page
- + FIL_PAGE_LSN)
- && old_checksum_field != BUF_NO_CHECKSUM_MAGIC
- && old_checksum_field
- != buf_calc_page_old_checksum(page)) {
-
- page_is_corrupt = TRUE;
- }
-
- if (!srv_fast_checksum
- && checksum_field != 0
- && checksum_field != BUF_NO_CHECKSUM_MAGIC
- && checksum_field
- != buf_calc_page_new_checksum(page)) {
-
- page_is_corrupt = TRUE;
- }
-
- if (srv_fast_checksum
- && checksum_field != 0
- && checksum_field != BUF_NO_CHECKSUM_MAGIC
- && checksum_field
- != buf_calc_page_new_checksum_32(page)
- && checksum_field
- != buf_calc_page_new_checksum(page)) {
+ != offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)) {
page_is_corrupt = TRUE;
}
@@ -3316,7 +3372,8 @@ skip_info:
/* it should be overwritten already */
ut_a(!page_is_corrupt);
- } else if (!((offset / UNIV_PAGE_SIZE) % UNIV_PAGE_SIZE)) {
+ } else if (!((offset / (zip_size ? zip_size : UNIV_PAGE_SIZE))
+ % (zip_size ? zip_size : UNIV_PAGE_SIZE))) {
/* descr page (not header) */
if (page_is_corrupt) {
file_is_corrupt = TRUE;
@@ -3327,7 +3384,7 @@ skip_info:
}
/* store as descr page */
- memcpy(descr_page, page, UNIV_PAGE_SIZE);
+ memcpy(descr_page, page, (zip_size ? zip_size : UNIV_PAGE_SIZE));
} else if (descr_is_corrupt) {
/* unknown state of the page */
@@ -3355,9 +3412,12 @@ skip_info:
ulint bit_index;
descr = descr_page + XDES_ARR_OFFSET
- + XDES_SIZE * (ut_2pow_remainder((offset / UNIV_PAGE_SIZE), UNIV_PAGE_SIZE) / FSP_EXTENT_SIZE);
+ + XDES_SIZE * (ut_2pow_remainder(
+ (offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)),
+ (zip_size ? zip_size : UNIV_PAGE_SIZE)) / FSP_EXTENT_SIZE);
- index = XDES_FREE_BIT + XDES_BITS_PER_PAGE * ((offset / UNIV_PAGE_SIZE) % FSP_EXTENT_SIZE);
+ index = XDES_FREE_BIT
+ + XDES_BITS_PER_PAGE * ((offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)) % FSP_EXTENT_SIZE);
byte_index = index / 8;
bit_index = index % 8;
@@ -3375,7 +3435,7 @@ skip_info:
}
if (page_is_corrupt) {
- fprintf(stderr, " [errp:%ld]", (long) (offset / UNIV_PAGE_SIZE));
+ fprintf(stderr, " [errp:%ld]", (long) (offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)));
/* cannot treat corrupt page */
goto skip_write;
@@ -3385,7 +3445,13 @@ skip_info:
mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id);
for (i = 0; (ulint) i < n_index; i++) {
- if ((ulint) (offset / UNIV_PAGE_SIZE) == root_page[i]) {
+ if ((ulint) (offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)) == root_page[i]) {
+ if (fil_page_get_type(page) != FIL_PAGE_INDEX) {
+ file_is_corrupt = TRUE;
+ fprintf(stderr, " [etyp:%ld]",
+ (long) (offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)));
+ goto skip_write;
+ }
/* this is index root page */
mach_write_to_4(page + FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+ FSEG_HDR_SPACE, id);
@@ -3398,7 +3464,14 @@ skip_info:
if (fil_page_get_type(page) == FIL_PAGE_INDEX) {
dulint tmp = mach_read_from_8(page + (PAGE_HEADER + PAGE_INDEX_ID));
- if (mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL) == 0
+ for (i = 0; i < n_index; i++) {
+ if (ut_dulint_cmp(old_id[i], tmp) == 0) {
+ mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), new_id[i]);
+ break;
+ }
+ }
+
+ if (!zip_size && mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL) == 0
&& ut_dulint_cmp(old_id[0], tmp) == 0) {
/* leaf page of cluster index, reset trx_id of records */
rec_t* rec;
@@ -3417,7 +3490,7 @@ skip_info:
ULINT_UNDEFINED, &heap);
n_fields = rec_offs_n_fields(offsets);
if (!offset) {
- offset = row_get_trx_id_offset(rec, index, offsets);
+ offset = row_get_trx_id_offset(index, offsets);
}
trx_write_trx_id(rec + offset, ut_dulint_create(0, 1));
@@ -3437,44 +3510,34 @@ skip_info:
rec = page_rec_get_next(rec);
n_recs--;
}
- }
-
- for (i = 0; i < n_index; i++) {
- if (ut_dulint_cmp(old_id[i], tmp) == 0) {
- mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), new_id[i]);
- break;
- }
+ } else if (mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL) == 0
+ && ut_dulint_cmp(old_id[0], tmp) != 0) {
+ mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), ut_dulint_create(0, 1));
}
}
if (mach_read_ull(page + FIL_PAGE_LSN) > current_lsn) {
mach_write_ull(page + FIL_PAGE_LSN, current_lsn);
- mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
- current_lsn);
+ if (!zip_size) {
+ mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ current_lsn);
+ }
}
- mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
- srv_use_checksums
- ? (!srv_fast_checksum
- ? buf_calc_page_new_checksum(page)
- : buf_calc_page_new_checksum_32(page))
- : BUF_NO_CHECKSUM_MAGIC);
- mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
- srv_use_checksums
- ? buf_calc_page_old_checksum(page)
- : BUF_NO_CHECKSUM_MAGIC);
+ fil_page_buf_page_store_checksum(page, zip_size);
success = os_file_write(filepath, file, page,
(ulint)(offset & 0xFFFFFFFFUL),
- (ulint)(offset >> 32), UNIV_PAGE_SIZE);
+ (ulint)(offset >> 32),
+ zip_size ? zip_size : UNIV_PAGE_SIZE);
}
skip_write:
if (free_limit_bytes
- && ((ib_int64_t)((offset + UNIV_PAGE_SIZE) * 100) / free_limit_bytes)
+ && ((ib_int64_t)((offset + (zip_size ? zip_size : UNIV_PAGE_SIZE)) * 100) / free_limit_bytes)
!= ((offset * 100) / free_limit_bytes)) {
fprintf(stderr, " %lu",
- (ulong)((ib_int64_t)((offset + UNIV_PAGE_SIZE) * 100) / free_limit_bytes));
+ (ulong)((ib_int64_t)((offset + (zip_size ? zip_size : UNIV_PAGE_SIZE)) * 100) / free_limit_bytes));
}
}
@@ -3530,13 +3593,6 @@ skip_write:
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
- } else {
- /* zip page? */
- size = (ulint)
- (size_bytes
- / dict_table_flags_to_zip_size(flags));
- fprintf(stderr, "InnoDB: import: table %s seems to be in newer format."
- " It may not be able to treated for now.\n", name);
}
/* .exp file should be removed */
success = os_file_delete(info_file_path);
@@ -3618,6 +3674,269 @@ func_exit:
os_file_close(file);
mem_free(filepath);
+ if (srv_expand_import && dict_table_flags_to_zip_size(flags)) {
+ ulint page_no;
+ ulint zip_size;
+ ulint height;
+ rec_t* node_ptr;
+ dict_table_t* table;
+ dict_index_t* index;
+ buf_block_t* block;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ mtr_t mtr;
+
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+
+ rec_offs_init(offsets_);
+
+ zip_size = dict_table_flags_to_zip_size(flags);
+
+ table = dict_table_get_low(name);
+ index = dict_table_get_first_index(table);
+ page_no = dict_index_get_page(index);
+ ut_a(page_no == 3);
+
+ fprintf(stderr, "InnoDB: It is compressed .ibd file. need to convert additionaly on buffer pool.\n");
+
+ /* down to leaf */
+ mtr_start(&mtr);
+ mtr_set_log_mode(&mtr, MTR_LOG_NONE);
+
+ height = ULINT_UNDEFINED;
+
+ for (;;) {
+ block = buf_page_get(space_id, zip_size, page_no,
+ RW_NO_LATCH, &mtr);
+ page = buf_block_get_frame(block);
+
+ block->check_index_page_at_flush = TRUE;
+
+ if (height == ULINT_UNDEFINED) {
+ height = btr_page_get_level(page, &mtr);
+ }
+
+ if (height == 0) {
+ break;
+ }
+
+ node_ptr = page_rec_get_next(page_get_infimum_rec(page));
+
+ height--;
+
+ offsets = rec_get_offsets(node_ptr, index, offsets, ULINT_UNDEFINED, &heap);
+ page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
+ }
+
+ mtr_commit(&mtr);
+
+ fprintf(stderr, "InnoDB: pages needs split are ...");
+
+ /* scan reaf pages */
+ while (page_no != FIL_NULL) {
+ rec_t* rec;
+ rec_t* supremum;
+ ulint n_recs;
+
+ mtr_start(&mtr);
+
+ block = buf_page_get(space_id, zip_size, page_no,
+ RW_X_LATCH, &mtr);
+ page = buf_block_get_frame(block);
+ page_zip = buf_block_get_page_zip(block);
+
+ if (!page_zip) {
+ /*something wrong*/
+ fprintf(stderr, "InnoDB: Something wrong with reading page %lu.\n", page_no);
+convert_err_exit:
+ mtr_commit(&mtr);
+ mutex_enter(&fil_system->mutex);
+ fil_space_free(space_id, FALSE);
+ mutex_exit(&fil_system->mutex);
+ success = FALSE;
+ goto convert_exit;
+ }
+
+ supremum = page_get_supremum_rec(page);
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+ n_recs = page_get_n_recs(page);
+
+ /* illegal operation as InnoDB online system. so not logged */
+ while (rec && rec != supremum && n_recs > 0) {
+ ulint n_fields;
+ ulint i;
+ ulint offset = index->trx_id_offset;
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ n_fields = rec_offs_n_fields(offsets);
+ if (!offset) {
+ offset = row_get_trx_id_offset(index, offsets);
+ }
+ trx_write_trx_id(rec + offset, ut_dulint_create(0, 1));
+
+ for (i = 0; i < n_fields; i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint local_len;
+ byte* data;
+
+ data = rec_get_nth_field(rec, offsets, i, &local_len);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ mach_write_to_4(data + local_len + BTR_EXTERN_SPACE_ID, id);
+ }
+ }
+
+ rec = page_rec_get_next(rec);
+ n_recs--;
+ }
+
+ /* dummy logged update for along with modified page path */
+ if (ut_dulint_cmp(index->id, btr_page_get_index_id(page)) != 0) {
+ /* this should be adjusted already */
+ fprintf(stderr, "InnoDB: The page %lu seems to be converted wrong.\n", page_no);
+ goto convert_err_exit;
+ }
+ btr_page_set_index_id(page, page_zip, index->id, &mtr);
+
+ /* confirm whether fits to the page size or not */
+ if (!page_zip_compress(page_zip, page, index, &mtr)
+ && !btr_page_reorganize(block, index, &mtr)) {
+ buf_block_t* new_block;
+ page_t* new_page;
+ page_zip_des_t* new_page_zip;
+ rec_t* split_rec;
+ ulint n_uniq;
+
+ /* split page is needed */
+ fprintf(stderr, " %lu", page_no);
+
+ mtr_x_lock(dict_index_get_lock(index), &mtr);
+
+ n_uniq = dict_index_get_n_unique_in_tree(index);
+
+ if(page_get_n_recs(page) < 2) {
+ /* no way to make smaller */
+ fprintf(stderr, "InnoDB: The page %lu cannot be store to the page size.\n", page_no);
+ goto convert_err_exit;
+ }
+
+ if (UNIV_UNLIKELY(page_no == dict_index_get_page(index))) {
+ ulint new_page_no;
+ dtuple_t* node_ptr;
+ ulint level;
+ rec_t* node_ptr_rec;
+ page_cur_t page_cursor;
+
+ /* it is root page, need to raise before split */
+
+ level = btr_page_get_level(page, &mtr);
+
+ new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, &mtr);
+ new_page = buf_block_get_frame(new_block);
+ new_page_zip = buf_block_get_page_zip(new_block);
+ btr_page_create(new_block, new_page_zip, index, level, &mtr);
+
+ btr_page_set_next(new_page, new_page_zip, FIL_NULL, &mtr);
+ btr_page_set_prev(new_page, new_page_zip, FIL_NULL, &mtr);
+
+ page_zip_copy_recs(new_page_zip, new_page,
+ page_zip, page, index, &mtr);
+ btr_search_move_or_delete_hash_entries(new_block, block, index);
+
+ rec = page_rec_get_next(page_get_infimum_rec(new_page));
+ new_page_no = buf_block_get_page_no(new_block);
+
+ node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap,
+ level);
+ dtuple_set_info_bits(node_ptr,
+ dtuple_get_info_bits(node_ptr)
+ | REC_INFO_MIN_REC_FLAG);
+ btr_page_empty(block, page_zip, index, level + 1, &mtr);
+
+ btr_page_set_next(page, page_zip, FIL_NULL, &mtr);
+ btr_page_set_prev(page, page_zip, FIL_NULL, &mtr);
+
+ page_cur_set_before_first(block, &page_cursor);
+
+ node_ptr_rec = page_cur_tuple_insert(&page_cursor, node_ptr,
+ index, 0, &mtr);
+ ut_a(node_ptr_rec);
+
+ if (!btr_page_reorganize(block, index, &mtr)) {
+ fprintf(stderr, "InnoDB: failed to store the page %lu.\n", page_no);
+ goto convert_err_exit;
+ }
+
+ /* move to the raised page */
+ page_no = new_page_no;
+ block = new_block;
+ page = new_page;
+ page_zip = new_page_zip;
+
+ fprintf(stderr, "(raise_to:%lu)", page_no);
+ }
+
+ split_rec = page_get_middle_rec(page);
+
+ new_block = btr_page_alloc(index, page_no + 1, FSP_UP,
+ btr_page_get_level(page, &mtr), &mtr);
+ new_page = buf_block_get_frame(new_block);
+ new_page_zip = buf_block_get_page_zip(new_block);
+ btr_page_create(new_block, new_page_zip, index,
+ btr_page_get_level(page, &mtr), &mtr);
+
+ offsets = rec_get_offsets(split_rec, index, offsets, n_uniq, &heap);
+
+ btr_attach_half_pages(index, block,
+ split_rec, new_block, FSP_UP, &mtr);
+
+ page_zip_copy_recs(new_page_zip, new_page,
+ page_zip, page, index, &mtr);
+ page_delete_rec_list_start(split_rec - page + new_page,
+ new_block, index, &mtr);
+ btr_search_move_or_delete_hash_entries(new_block, block, index);
+ page_delete_rec_list_end(split_rec, block, index,
+ ULINT_UNDEFINED, ULINT_UNDEFINED, &mtr);
+
+ fprintf(stderr, "(new:%lu)", buf_block_get_page_no(new_block));
+
+ /* Are they needed? */
+ if (!btr_page_reorganize(block, index, &mtr)) {
+ fprintf(stderr, "InnoDB: failed to store the page %lu.\n", page_no);
+ goto convert_err_exit;
+ }
+ if (!btr_page_reorganize(new_block, index, &mtr)) {
+ fprintf(stderr, "InnoDB: failed to store the page %lu.\n", buf_block_get_page_no(new_block));
+ goto convert_err_exit;
+ }
+ }
+
+ page_no = btr_page_get_next(page, &mtr);
+
+ mtr_commit(&mtr);
+
+ if (heap) {
+ mem_heap_empty(heap);
+ }
+ }
+
+ fprintf(stderr, "...done.\nInnoDB: waiting the flush batch of the additional conversion.\n");
+
+ /* should wait for the not-logged changes are all flushed */
+ buf_flush_batch(BUF_FLUSH_LIST, ULINT_MAX, mtr.end_lsn + 1);
+ buf_flush_wait_batch_end(BUF_FLUSH_LIST);
+
+ fprintf(stderr, "InnoDB: done.\n");
+convert_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
return(success);
}
#endif /* !UNIV_HOTBACKUP */
@@ -4466,7 +4785,7 @@ fil_extend_space_to_desired_size(
mutex_exit(&fil_system->mutex);
mutex_exit(&fil_system->file_extend_mutex);
- fil_flush(space_id);
+ fil_flush(space_id, TRUE);
return(success);
}
@@ -4835,7 +5154,7 @@ _fil_io(
&& ((buf_page_t*)message)->space_was_being_deleted) {
if (mode == OS_AIO_NORMAL) {
- buf_page_io_complete(message, trx);
+ buf_page_io_complete(message);
return(DB_SUCCESS); /*fake*/
}
if (type == OS_FILE_READ) {
@@ -4946,14 +5265,14 @@ _fil_io(
ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0);
- if (srv_pass_corrupt_table && space->is_corrupt) {
+ if (srv_pass_corrupt_table == 1 && space->is_corrupt) {
/* should ignore i/o for the crashed space */
mutex_enter(&fil_system->mutex);
fil_node_complete_io(node, fil_system, type);
mutex_exit(&fil_system->mutex);
if (mode == OS_AIO_NORMAL) {
ut_a(space->purpose == FIL_TABLESPACE);
- buf_page_io_complete(message, trx);
+ buf_page_io_complete(message);
}
if (type == OS_FILE_READ) {
return(DB_TABLESPACE_DELETED);
@@ -4961,7 +5280,19 @@ _fil_io(
return(DB_SUCCESS);
}
} else {
- ut_a(!space->is_corrupt);
+ if (srv_pass_corrupt_table > 1 && space->is_corrupt) {
+ /* should ignore write i/o for the crashed space */
+ if (type == OS_FILE_WRITE) {
+ mutex_enter(&fil_system->mutex);
+ fil_node_complete_io(node, fil_system, type);
+ mutex_exit(&fil_system->mutex);
+ if (mode == OS_AIO_NORMAL) {
+ ut_a(space->purpose == FIL_TABLESPACE);
+ buf_page_io_complete(message);
+ }
+ return(DB_SUCCESS);
+ }
+ }
#ifdef UNIV_HOTBACKUP
/* In ibbackup do normal i/o, not aio */
if (type == OS_FILE_READ) {
@@ -5122,7 +5453,7 @@ fil_aio_wait(
|| buf_page_get_state(message) != BUF_BLOCK_FILE_PAGE);
srv_set_io_thread_op_info(segment, "complete io for buf page");
- buf_page_io_complete(message, NULL);
+ buf_page_io_complete(message);
return;
}
@@ -5146,7 +5477,7 @@ fil_aio_wait(
if (fil_node->space->purpose == FIL_TABLESPACE) {
srv_set_io_thread_op_info(segment, "complete io for buf page");
- buf_page_io_complete(message, NULL);
+ buf_page_io_complete(message);
} else {
srv_set_io_thread_op_info(segment, "complete io for log");
log_io_complete(message);
@@ -5161,8 +5492,9 @@ UNIV_INTERN
void
fil_flush(
/*======*/
- ulint space_id) /*!< in: file space id (this can be a group of
+ ulint space_id, /*!< in: file space id (this can be a group of
log files or a tablespace of the database) */
+ ibool metadata)
{
fil_space_t* space;
fil_node_t* node;
@@ -5233,7 +5565,7 @@ retry:
/* fprintf(stderr, "Flushing to file %s\n",
node->name); */
- os_file_flush(file);
+ os_file_flush(file, metadata);
mutex_enter(&fil_system->mutex);
@@ -5316,7 +5648,7 @@ fil_flush_file_spaces(
a non-existing space id. */
for (i = 0; i < n_space_ids; i++) {
- fil_flush(space_ids[i]);
+ fil_flush(space_ids[i], TRUE);
}
mem_free(space_ids);
diff --git a/storage/xtradb/ha/hash0hash.c b/storage/xtradb/ha/hash0hash.c
index 0f4fc55d895..30c304dafcd 100644
--- a/storage/xtradb/ha/hash0hash.c
+++ b/storage/xtradb/ha/hash0hash.c
@@ -128,70 +128,6 @@ hash_create(
}
/*************************************************************//**
-*/
-UNIV_INTERN
-ulint
-hash_create_needed(
-/*===============*/
- ulint n)
-{
- ulint prime;
- ulint offset;
-
- prime = ut_find_prime(n);
-
- offset = (sizeof(hash_table_t) + 7) / 8;
- offset *= 8;
-
- return(offset + sizeof(hash_cell_t) * prime);
-}
-
-UNIV_INTERN
-void
-hash_create_init(
-/*=============*/
- hash_table_t* table,
- ulint n)
-{
- ulint prime;
- ulint offset;
-
- prime = ut_find_prime(n);
-
- offset = (sizeof(hash_table_t) + 7) / 8;
- offset *= 8;
-
- table->array = (hash_cell_t*)(((byte*)table) + offset);
- table->n_cells = prime;
-# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
- table->adaptive = FALSE;
-# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
- table->n_mutexes = 0;
- table->mutexes = NULL;
- table->heaps = NULL;
- table->heap = NULL;
- ut_d(table->magic_n = HASH_TABLE_MAGIC_N);
-
- /* Initialize the cell array */
- hash_table_clear(table);
-}
-
-UNIV_INTERN
-void
-hash_create_reuse(
-/*==============*/
- hash_table_t* table)
-{
- ulint offset;
-
- offset = (sizeof(hash_table_t) + 7) / 8;
- offset *= 8;
-
- table->array = (hash_cell_t*)(((byte*)table) + offset);
- ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-}
-
-/*************************************************************//**
Frees a hash table. */
UNIV_INTERN
void
diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc
index 783508658a3..972a4407eea 100644
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2000, 2010, MySQL AB & Innobase Oy. All Rights Reserved.
+Copyright (c) 2000, 2011, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
Copyright (c) 2009, Percona Inc.
@@ -198,11 +198,14 @@ static my_bool innobase_create_status_file = FALSE;
static my_bool innobase_stats_on_metadata = TRUE;
static my_bool innobase_use_sys_stats_table = FALSE;
static my_bool innobase_buffer_pool_shm_checksum = TRUE;
+static uint innobase_buffer_pool_shm_key = 0;
static char* internal_innobase_data_file_path = NULL;
static char* innodb_version_str = (char*) INNODB_VERSION_STR;
+static my_bool innobase_blocking_lru_restore = FALSE;
+
/** Possible values for system variable "innodb_stats_method". The values
are defined the same as its corresponding MyISAM system variable
"myisam_stats_method"(see "myisam_stats_method_names"), for better usability */
@@ -367,6 +370,12 @@ static MYSQL_THDVAR_ULONG(flush_log_at_trx_commit_session, PLUGIN_VAR_RQCMDARG,
"The value 3 regards innodb_flush_log_at_trx_commit (default).",
NULL, NULL, 3, 0, 3, 0);
+static MYSQL_THDVAR_BOOL(fake_changes, PLUGIN_VAR_OPCMDARG,
+ "In the transaction after enabled, UPDATE, INSERT and DELETE only move the cursor to the records "
+ "and do nothing other operations (no changes, no ibuf, no undo, no transaction log) in the transaction. "
+ "This is to cause replication prefetch IO. ATTENTION: the transaction started after enabled is affected.",
+ NULL, NULL, FALSE);
+
static handler *innobase_create_handler(handlerton *hton,
TABLE_SHARE *table,
@@ -547,6 +556,8 @@ static SHOW_VAR innodb_status_variables[]= {
(char*) &export_vars.innodb_buffer_pool_pages_misc, SHOW_LONG},
{"buffer_pool_pages_total",
(char*) &export_vars.innodb_buffer_pool_pages_total, SHOW_LONG},
+ {"buffer_pool_read_ahead_rnd",
+ (char*) &export_vars.innodb_buffer_pool_read_ahead_rnd, SHOW_LONG},
{"buffer_pool_read_ahead",
(char*) &export_vars.innodb_buffer_pool_read_ahead, SHOW_LONG},
{"buffer_pool_read_ahead_evicted",
@@ -1432,6 +1443,8 @@ innobase_trx_init(
trx->check_unique_secondary = !thd_test_options(
thd, OPTION_RELAXED_UNIQUE_CHECKS);
+ trx->fake_changes = THDVAR(thd, fake_changes);
+
#ifdef EXTENDED_SLOWLOG
if (thd_log_slow_verbosity(thd) & SLOG_V_INNODB) {
trx->take_stats = TRUE;
@@ -2504,6 +2517,12 @@ innobase_change_buffering_inited_ok:
srv_buf_pool_size = (ulint) innobase_buffer_pool_size;
+ if (innobase_buffer_pool_shm_key) {
+ fprintf(stderr,
+ "InnoDB: Warning: innodb_buffer_pool_shm_key is deprecated function.\n"
+ "InnoDB: innodb_buffer_pool_shm_key was ignored.\n");
+ }
+
srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
srv_n_file_io_threads = (ulint) innobase_file_io_threads;
@@ -2520,7 +2539,8 @@ innobase_change_buffering_inited_ok:
srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
srv_use_checksums = (ibool) innobase_use_checksums;
srv_fast_checksum = (ibool) innobase_fast_checksum;
- srv_buffer_pool_shm_checksum = (ibool) innobase_buffer_pool_shm_checksum;
+
+ srv_blocking_lru_restore = (ibool) innobase_blocking_lru_restore;
#ifdef HAVE_LARGE_PAGES
if ((os_use_large_pages = (ibool) my_use_large_pages))
@@ -2558,6 +2578,10 @@ innobase_change_buffering_inited_ok:
innobase_commit_concurrency_init_default();
+#ifndef EXTENDED_FOR_KILLIDLE
+ srv_kill_idle_transaction = 0;
+#endif
+
/* Since we in this module access directly the fields of a trx
struct, and due to different headers and flags it might happen that
mutex_t has a different size in this module and in InnoDB
@@ -2756,7 +2780,7 @@ innobase_commit_low(
#ifdef MYSQL_SERVER
THD *thd=current_thd;
- if (thd && thd->slave_thread) {
+ if (thd && thd_is_replication_slave_thread(thd)) {
/* Update the replication position info inside InnoDB.
In embedded server, does nothing. */
const char *log_file_name, *group_relay_log_name;
@@ -2958,6 +2982,11 @@ innobase_commit(
trx_search_latch_release_if_reserved(trx);
}
+ if (trx->fake_changes && (all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)))) {
+ innobase_rollback(hton, thd, all); /* rollback implicitly */
+ thd->main_da.reset_diagnostics_area(); /* because debug assertion code complains, if something left */
+ DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+ }
/* The flag TRX_ACTIVE_IN_MYSQL in trx->active_flag is set in
1. ::external_lock(),
@@ -3903,7 +3932,7 @@ ha_innobase::open(
DBUG_RETURN(1);
}
- if (share->ib_table && share->ib_table->is_corrupt) {
+ if (srv_pass_corrupt_table <= 1 && share->ib_table && share->ib_table->is_corrupt) {
free_share(share);
DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
@@ -3936,18 +3965,18 @@ retry:
/* Get pointer to a table object in InnoDB dictionary cache */
ib_table = dict_table_get(norm_name, TRUE);
- if (ib_table && ib_table->is_corrupt) {
+ if (srv_pass_corrupt_table <= 1 && ib_table && ib_table->is_corrupt) {
free_share(share);
my_free(upd_buff, MYF(0));
DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
}
- if (share->ib_table) {
- ut_a(share->ib_table == ib_table);
- } else {
- share->ib_table = ib_table;
- }
+ share->ib_table = ib_table;
+
+
+
+
if (NULL == ib_table) {
if (is_part && retries < 10) {
@@ -4244,25 +4273,6 @@ field_in_record_is_null(
return(0);
}
-/**************************************************************//**
-Sets a field in a record to SQL NULL. Uses the record format
-information in table to track the null bit in record. */
-static inline
-void
-set_field_in_record_to_null(
-/*========================*/
- TABLE* table, /*!< in: MySQL table object */
- Field* field, /*!< in: MySQL field object */
- char* record) /*!< in: a row in MySQL format */
-{
- int null_offset;
-
- null_offset = (uint) ((char*) field->null_ptr
- - (char*) table->record[0]);
-
- record[null_offset] = record[null_offset] | field->null_bit;
-}
-
/*************************************************************//**
InnoDB uses this function to compare two data fields for which the data type
is such that we must use MySQL code to compare them. NOTE that the prototype
@@ -5672,14 +5682,15 @@ calc_row_difference(
/* The field has changed */
ufield = uvect->fields + n_changed;
+ UNIV_MEM_INVALID(ufield, sizeof *ufield);
/* Let us use a dummy dfield to make the conversion
from the MySQL column format to the InnoDB format */
- dict_col_copy_type(prebuilt->table->cols + innodb_idx,
- dfield_get_type(&dfield));
-
if (n_len != UNIV_SQL_NULL) {
+ dict_col_copy_type(prebuilt->table->cols + innodb_idx,
+ dfield_get_type(&dfield));
+
buf = row_mysql_store_col_in_innobase_format(
&dfield,
(byte*)buf,
@@ -5687,7 +5698,7 @@ calc_row_difference(
new_mysql_row_col,
col_pack_len,
dict_table_is_comp(prebuilt->table));
- dfield_copy_data(&ufield->new_val, &dfield);
+ dfield_copy(&ufield->new_val, &dfield);
} else {
dfield_set_null(&ufield->new_val);
}
@@ -6128,7 +6139,7 @@ ha_innobase::index_read(
ha_statistic_increment(&SSV::ha_read_key_count);
- if (share->ib_table->is_corrupt) {
+ if (srv_pass_corrupt_table <= 1 && share->ib_table->is_corrupt) {
DBUG_RETURN(HA_ERR_CRASHED);
}
@@ -6197,7 +6208,7 @@ ha_innobase::index_read(
ret = DB_UNSUPPORTED;
}
- if (share->ib_table->is_corrupt) {
+ if (srv_pass_corrupt_table <= 1 && share->ib_table->is_corrupt) {
DBUG_RETURN(HA_ERR_CRASHED);
}
@@ -6316,7 +6327,7 @@ ha_innobase::change_active_index(
{
DBUG_ENTER("change_active_index");
- if (share->ib_table->is_corrupt) {
+ if (srv_pass_corrupt_table <= 1 && share->ib_table->is_corrupt) {
DBUG_RETURN(HA_ERR_CRASHED);
}
@@ -6411,7 +6422,7 @@ ha_innobase::general_fetch(
DBUG_ENTER("general_fetch");
- if (share->ib_table->is_corrupt) {
+ if (srv_pass_corrupt_table <= 1 && share->ib_table->is_corrupt) {
DBUG_RETURN(HA_ERR_CRASHED);
}
@@ -6424,7 +6435,7 @@ ha_innobase::general_fetch(
innodb_srv_conc_exit_innodb(prebuilt->trx);
- if (share->ib_table->is_corrupt) {
+ if (srv_pass_corrupt_table <= 1 && share->ib_table->is_corrupt) {
DBUG_RETURN(HA_ERR_CRASHED);
}
@@ -6765,10 +6776,6 @@ create_table_def(
DBUG_PRINT("enter", ("table_name: %s", table_name));
ut_a(trx->mysql_thd != NULL);
- if (IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(table_name,
- (THD*) trx->mysql_thd)) {
- DBUG_RETURN(HA_ERR_GENERIC);
- }
/* MySQL does the name length check. But we do additional check
on the name length here */
@@ -6890,6 +6897,8 @@ err_col:
col_len);
}
+ srv_lower_case_table_names = lower_case_table_names;
+
error = row_create_table_for_mysql(table, trx);
if (error == DB_DUPLICATE_KEY) {
@@ -7306,42 +7315,17 @@ ha_innobase::create(
DBUG_RETURN(HA_ERR_TO_BIG_ROW);
}
- /* Get the transaction associated with the current thd, or create one
- if not yet created */
-
- parent_trx = check_trx_exists(thd);
-
- /* In case MySQL calls this in the middle of a SELECT query, release
- possible adaptive hash latch to avoid deadlocks of threads */
-
- trx_search_latch_release_if_reserved(parent_trx);
-
- trx = innobase_trx_allocate(thd);
-
- if (lower_case_table_names) {
- srv_lower_case_table_names = TRUE;
- } else {
- srv_lower_case_table_names = FALSE;
- }
-
strcpy(name2, name);
normalize_table_name(norm_name, name2);
- /* Latch the InnoDB data dictionary exclusively so that no deadlocks
- or lock waits can happen in it during a table create operation.
- Drop table etc. do this latching in row0mysql.c. */
-
- row_mysql_lock_data_dictionary(trx);
-
/* Create the table definition in InnoDB */
flags = 0;
/* Validate create options if innodb_strict_mode is set. */
if (!create_options_are_valid(thd, form, create_info)) {
- error = ER_ILLEGAL_HA_CREATE_OPTION;
- goto cleanup;
+ DBUG_RETURN(ER_ILLEGAL_HA_CREATE_OPTION);
}
if (create_info->key_block_size) {
@@ -7483,16 +7467,43 @@ ha_innobase::create(
/* Check for name conflicts (with reserved name) for
any user indices to be created. */
- if (innobase_index_name_is_reserved(trx, form->key_info,
+ if (innobase_index_name_is_reserved(thd, form->key_info,
form->s->keys)) {
- error = -1;
- goto cleanup;
+ DBUG_RETURN(-1);
+ }
+
+ if (IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(norm_name, thd)) {
+ DBUG_RETURN(HA_ERR_GENERIC);
}
if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
flags |= DICT_TF2_TEMPORARY << DICT_TF2_SHIFT;
}
+ /* Get the transaction associated with the current thd, or create one
+ if not yet created */
+
+ parent_trx = check_trx_exists(thd);
+
+ /* In case MySQL calls this in the middle of a SELECT query, release
+ possible adaptive hash latch to avoid deadlocks of threads */
+
+ trx_search_latch_release_if_reserved(parent_trx);
+
+ trx = innobase_trx_allocate(thd);
+
+ if (trx->fake_changes) {
+ innobase_commit_low(trx);
+ trx_free_for_mysql(trx);
+ DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+ }
+
+ /* Latch the InnoDB data dictionary exclusively so that no deadlocks
+ or lock waits can happen in it during a table create operation.
+ Drop table etc. do this latching in row0mysql.c. */
+
+ row_mysql_lock_data_dictionary(trx);
+
error = create_table_def(trx, form, norm_name,
create_info->options & HA_LEX_CREATE_TMP_TABLE ? name2 : NULL,
flags);
@@ -7692,6 +7703,10 @@ ha_innobase::delete_all_rows(void)
DBUG_RETURN(HA_ERR_CRASHED);
}
+ if (prebuilt->trx->fake_changes) {
+ goto fallback;
+ }
+
/* Truncate the table in InnoDB */
error = row_truncate_table_for_mysql(prebuilt->table, prebuilt->trx);
@@ -7752,10 +7767,10 @@ ha_innobase::delete_table(
trx = innobase_trx_allocate(thd);
- if (lower_case_table_names) {
- srv_lower_case_table_names = TRUE;
- } else {
- srv_lower_case_table_names = FALSE;
+ if (trx->fake_changes) {
+ innobase_commit_low(trx);
+ trx_free_for_mysql(trx);
+ DBUG_RETURN(HA_ERR_WRONG_COMMAND);
}
name_len = strlen(name);
@@ -7764,6 +7779,8 @@ ha_innobase::delete_table(
/* Drop the table in InnoDB */
+ srv_lower_case_table_names = lower_case_table_names;
+
error = row_drop_table_for_mysql(norm_name, trx,
thd_sql_command(thd)
== SQLCOM_DROP_DB);
@@ -7844,6 +7861,12 @@ innobase_drop_database(
trx->mysql_thd = NULL;
#else
trx = innobase_trx_allocate(thd);
+ if (trx->fake_changes) {
+ my_free(namebuf, MYF(0));
+ innobase_commit_low(trx);
+ trx_free_for_mysql(trx);
+ return; /* ignore */
+ }
#endif
row_drop_database_for_mysql(namebuf, trx);
my_free(namebuf, MYF(0));
@@ -7880,12 +7903,6 @@ innobase_rename_table(
char* norm_from;
DBUG_ENTER("innobase_rename_table");
- if (lower_case_table_names) {
- srv_lower_case_table_names = TRUE;
- } else {
- srv_lower_case_table_names = FALSE;
- }
-
// Magic number 64 arbitrary
norm_to = (char*) my_malloc(strlen(to) + 64, MYF(0));
norm_from = (char*) my_malloc(strlen(from) + 64, MYF(0));
@@ -7900,6 +7917,8 @@ innobase_rename_table(
row_mysql_lock_data_dictionary(trx);
}
+ srv_lower_case_table_names = lower_case_table_names;
+
error = row_rename_table_for_mysql(
norm_from, norm_to, trx, lock_and_commit);
@@ -7957,6 +7976,11 @@ ha_innobase::rename_table(
trx_search_latch_release_if_reserved(parent_trx);
trx = innobase_trx_allocate(thd);
+ if (trx->fake_changes) {
+ innobase_commit_low(trx);
+ trx_free_for_mysql(trx);
+ DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+ }
error = innobase_rename_table(trx, from, to, TRUE);
@@ -8460,6 +8484,8 @@ ha_innobase::info_low(
if (flag & HA_STATUS_VARIABLE) {
+ ulint page_size;
+
dict_table_stats_lock(ib_table, RW_S_LATCH);
n_rows = ib_table->stat_n_rows;
@@ -8502,14 +8528,19 @@ ha_innobase::info_low(
prebuilt->autoinc_last_value = 0;
}
+ page_size = dict_table_zip_size(ib_table);
+ if (page_size == 0) {
+ page_size = UNIV_PAGE_SIZE;
+ }
+
stats.records = (ha_rows)n_rows;
stats.deleted = 0;
- stats.data_file_length = ((ulonglong)
- ib_table->stat_clustered_index_size)
- * UNIV_PAGE_SIZE;
- stats.index_file_length = ((ulonglong)
- ib_table->stat_sum_of_other_index_sizes)
- * UNIV_PAGE_SIZE;
+ stats.data_file_length
+ = ((ulonglong) ib_table->stat_clustered_index_size)
+ * page_size;
+ stats.index_file_length =
+ ((ulonglong) ib_table->stat_sum_of_other_index_sizes)
+ * page_size;
dict_table_stats_unlock(ib_table, RW_S_LATCH);
@@ -9466,10 +9497,18 @@ ha_innobase::external_lock(
reset_template();
- if (lock_type == F_WRLCK) {
+ if (lock_type == F_WRLCK
+ || (table->s->tmp_table
+ && thd_sql_command(thd) == SQLCOM_LOCK_TABLES)) {
/* If this is a SELECT, then it is in UPDATE TABLE ...
- or SELECT ... FOR UPDATE */
+ or SELECT ... FOR UPDATE
+
+ For temporary tables which are locked for READ by LOCK TABLES
+ updates are still allowed by SQL-layer. In order to accomodate
+ for such a situation we always request X-lock for such table
+ at LOCK TABLES time.
+ */
prebuilt->select_lock_type = LOCK_X;
prebuilt->stored_select_lock_type = LOCK_X;
}
@@ -10716,6 +10755,10 @@ innobase_xa_prepare(
return(0);
}
+ if (trx->fake_changes) {
+ return(0);
+ }
+
thd_get_xid(thd, (MYSQL_XID*) &trx->xid);
/* Release a possible FIFO ticket and search latch. Since we will
@@ -10804,7 +10847,7 @@ innobase_commit_by_xid(
if (trx) {
innobase_commit_low(trx);
-
+ trx_free_for_background(trx);
return(XA_OK);
} else {
return(XAER_NOTA);
@@ -10830,7 +10873,9 @@ innobase_rollback_by_xid(
trx = trx_get_trx_by_xid(xid);
if (trx) {
- return(innobase_rollback_trx(trx));
+ int ret = innobase_rollback_trx(trx);
+ trx_free_for_background(trx);
+ return(ret);
} else {
return(XAER_NOTA);
}
@@ -11058,7 +11103,7 @@ ha_innobase::check_if_incompatible_data(
if (info_row_type == ROW_TYPE_DEFAULT)
info_row_type = ROW_TYPE_COMPACT;
if ((info->used_fields & HA_CREATE_USED_ROW_FORMAT) &&
- get_row_type() != ((info->row_type == ROW_TYPE_DEFAULT)
+ row_type != ((info->row_type == ROW_TYPE_DEFAULT)
? ROW_TYPE_COMPACT : info->row_type)) {
DBUG_PRINT("info", ("get_row_type()=%d != info->row_type=%d -> "
@@ -11522,19 +11567,19 @@ static int show_innodb_vars(THD *thd, SHOW_VAR *var, char *buff)
return 0;
}
-/***********************************************************************
+/*********************************************************************//**
This function checks each index name for a table against reserved
-system default primary index name 'GEN_CLUST_INDEX'. If a name matches,
-this function pushes an warning message to the client, and returns true. */
+system default primary index name 'GEN_CLUST_INDEX'. If a name
+matches, this function pushes an warning message to the client,
+and returns true.
+@return true if the index name matches the reserved name */
extern "C" UNIV_INTERN
bool
innobase_index_name_is_reserved(
/*============================*/
- /* out: true if an index name
- matches the reserved name */
- const trx_t* trx, /* in: InnoDB transaction handle */
- const KEY* key_info, /* in: Indexes to be created */
- ulint num_of_keys) /* in: Number of indexes to
+ THD* thd, /*!< in/out: MySQL connection */
+ const KEY* key_info, /*!< in: Indexes to be created */
+ ulint num_of_keys) /*!< in: Number of indexes to
be created. */
{
const KEY* key;
@@ -11546,7 +11591,7 @@ innobase_index_name_is_reserved(
if (innobase_strcasecmp(key->name,
innobase_index_reserve_name) == 0) {
/* Push warning to mysql */
- push_warning_printf((THD*) trx->mysql_thd,
+ push_warning_printf(thd,
MYSQL_ERROR::WARN_LEVEL_WARN,
ER_WRONG_NAME_FOR_INDEX,
"Cannot Create Index with name "
@@ -11565,6 +11610,48 @@ innobase_index_name_is_reserved(
return(false);
}
+/***********************************************************************
+functions for kill session of idle transaction */
+extern "C"
+ibool
+innobase_thd_is_idle(
+/*=================*/
+ const void* thd) /*!< in: thread handle (THD*) */
+{
+#ifdef EXTENDED_FOR_KILLIDLE
+ return(thd_command((const THD*) thd) == COM_SLEEP);
+#else
+ return(FALSE);
+#endif
+}
+
+extern "C"
+ib_int64_t
+innobase_thd_get_start_time(
+/*========================*/
+ const void* thd) /*!< in: thread handle (THD*) */
+{
+#ifdef EXTENDED_FOR_KILLIDLE
+ return((ib_int64_t)thd_start_time((const THD*) thd));
+#else
+ return(0); /*dummy value*/
+#endif
+}
+
+extern "C"
+void
+innobase_thd_kill(
+/*==============*/
+ void* thd)
+{
+#ifdef EXTENDED_FOR_KILLIDLE
+ thd_kill((THD*) thd);
+#else
+ return;
+#endif
+}
+
+
static SHOW_VAR innodb_status_variables_export[]= {
{"Innodb", (char*) &show_innodb_vars, SHOW_FUNC},
{NullS, NullS, SHOW_LONG}
@@ -11624,7 +11711,7 @@ static MYSQL_SYSVAR_BOOL(recovery_stats, innobase_recovery_stats,
static MYSQL_SYSVAR_ULINT(use_purge_thread, srv_use_purge_thread,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Number of purge devoted threads. #### over 1 is EXPERIMENTAL ####",
- NULL, NULL, 1, 0, 64, 0);
+ NULL, NULL, 1, 0, UNIV_MAX_PARALLELISM, 0);
static MYSQL_SYSVAR_BOOL(overwrite_relay_log_info, innobase_overwrite_relay_log_info,
PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
@@ -11796,16 +11883,16 @@ static MYSQL_SYSVAR_ULONG(autoextend_increment, srv_auto_extend_increment,
static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"The size of the memory buffer InnoDB uses to cache data and indexes of its tables.",
- NULL, NULL, 128*1024*1024L, 32*1024*1024L, LONGLONG_MAX, 1024*1024L);
+ NULL, NULL, 128*1024*1024L, 5*1024*1024L, LONGLONG_MAX, 1024*1024L);
-static MYSQL_SYSVAR_UINT(buffer_pool_shm_key, srv_buffer_pool_shm_key,
+static MYSQL_SYSVAR_UINT(buffer_pool_shm_key, innobase_buffer_pool_shm_key,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
- "[experimental] The key value of shared memory segment for the buffer pool. 0 (default) disables the feature.",
+ "[Deprecated option] no effect",
NULL, NULL, 0, 0, INT_MAX32, 0);
static MYSQL_SYSVAR_BOOL(buffer_pool_shm_checksum, innobase_buffer_pool_shm_checksum,
PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
- "Enable buffer_pool_shm checksum validation (enabled by default).",
+ "[Deprecated option] no effect",
NULL, NULL, TRUE);
static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency,
@@ -11818,6 +11905,14 @@ static MYSQL_SYSVAR_ULONG(concurrency_tickets, srv_n_free_tickets_to_enter,
"Number of times a thread is allowed to enter InnoDB within the same SQL query after it has once got the ticket",
NULL, NULL, 500L, 1L, ~0L, 0);
+#ifdef EXTENDED_FOR_KILLIDLE
+#define kill_idle_help_text "If non-zero value, the idle session with transaction which is idle over the value in seconds is killed by InnoDB."
+#else
+#define kill_idle_help_text "No effect for this build."
+#endif
+static MYSQL_SYSVAR_LONGLONG(kill_idle_transaction, srv_kill_idle_transaction,
+ PLUGIN_VAR_RQCMDARG, kill_idle_help_text, NULL, NULL, 0, 0, LONG_MAX, 0);
+
static MYSQL_SYSVAR_LONG(file_io_threads, innobase_file_io_threads,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR,
"Number of file I/O threads in InnoDB.",
@@ -11953,6 +12048,11 @@ static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug,
NULL, NULL, 0, 0, 1, 0);
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+static MYSQL_SYSVAR_BOOL(random_read_ahead, srv_random_read_ahead,
+ PLUGIN_VAR_NOCMDARG,
+ "Whether to use read ahead for random access within an extent.",
+ NULL, NULL, FALSE);
+
static MYSQL_SYSVAR_ULONG(read_ahead_threshold, srv_read_ahead_threshold,
PLUGIN_VAR_RQCMDARG,
"Number of pages that must be accessed sequentially for InnoDB to "
@@ -12077,13 +12177,19 @@ static MYSQL_SYSVAR_UINT(auto_lru_dump, srv_auto_lru_dump,
"0 (the default) disables automatic dumps.",
NULL, NULL, 0, 0, UINT_MAX32, 0);
+static MYSQL_SYSVAR_BOOL(blocking_lru_restore, innobase_blocking_lru_restore,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Block XtraDB startup process until buffer pool is full restored from a "
+ "dump file (if present). Disabled by default.",
+ NULL, NULL, FALSE);
+
static MYSQL_SYSVAR_ULINT(pass_corrupt_table, srv_pass_corrupt_table,
PLUGIN_VAR_RQCMDARG,
"Pass corruptions of user tables as 'corrupt table' instead of not crashing itself, "
"when used with file_per_table. "
"All file io for the datafile after detected as corrupt are disabled, "
"except for the deletion.",
- NULL, NULL, 0, 0, 1, 0);
+ NULL, NULL, 0, 0, 2, 0);
static MYSQL_SYSVAR_ULINT(lazy_drop_table, srv_lazy_drop_table,
PLUGIN_VAR_RQCMDARG,
@@ -12103,6 +12209,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(fast_checksum),
MYSQL_SYSVAR(commit_concurrency),
MYSQL_SYSVAR(concurrency_tickets),
+ MYSQL_SYSVAR(kill_idle_transaction),
MYSQL_SYSVAR(data_file_path),
MYSQL_SYSVAR(doublewrite_file),
MYSQL_SYSVAR(data_home_dir),
@@ -12177,12 +12284,15 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
MYSQL_SYSVAR(change_buffering_debug),
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+ MYSQL_SYSVAR(random_read_ahead),
MYSQL_SYSVAR(read_ahead_threshold),
MYSQL_SYSVAR(io_capacity),
MYSQL_SYSVAR(auto_lru_dump),
+ MYSQL_SYSVAR(blocking_lru_restore),
MYSQL_SYSVAR(use_purge_thread),
MYSQL_SYSVAR(pass_corrupt_table),
MYSQL_SYSVAR(lazy_drop_table),
+ MYSQL_SYSVAR(fake_changes),
NULL
};
diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h
index 7b263db2537..749438e0c89 100644
--- a/storage/xtradb/handler/ha_innodb.h
+++ b/storage/xtradb/handler/ha_innodb.h
@@ -390,15 +390,14 @@ innobase_trx_allocate(
This function checks each index name for a table against reserved
system default primary index name 'GEN_CLUST_INDEX'. If a name
matches, this function pushes an warning message to the client,
-and returns true. */
+and returns true.
+@return true if the index name matches the reserved name */
extern "C"
bool
innobase_index_name_is_reserved(
/*============================*/
- /* out: true if the index name
- matches the reserved name */
- const trx_t* trx, /* in: InnoDB transaction handle */
- const KEY* key_info, /* in: Indexes to be created */
- ulint num_of_keys); /* in: Number of indexes to
+ THD* thd, /*!< in/out: MySQL connection */
+ const KEY* key_info, /*!< in: Indexes to be created */
+ ulint num_of_keys); /*!< in: Number of indexes to
be created. */
diff --git a/storage/xtradb/handler/handler0alter.cc b/storage/xtradb/handler/handler0alter.cc
index c54b4a17fd2..37fddf71cbc 100644
--- a/storage/xtradb/handler/handler0alter.cc
+++ b/storage/xtradb/handler/handler0alter.cc
@@ -649,44 +649,47 @@ ha_innobase::add_index(
update_thd();
- heap = mem_heap_create(1024);
-
/* In case MySQL calls this in the middle of a SELECT query, release
possible adaptive hash latch to avoid deadlocks of threads. */
trx_search_latch_release_if_reserved(prebuilt->trx);
- trx_start_if_not_started(prebuilt->trx);
+ if (prebuilt->trx->fake_changes) {
+ DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+ }
- /* Create a background transaction for the operations on
- the data dictionary tables. */
- trx = innobase_trx_allocate(user_thd);
- trx_start_if_not_started(trx);
+ /* Check if the index name is reserved. */
+ if (innobase_index_name_is_reserved(user_thd, key_info, num_of_keys)) {
+ DBUG_RETURN(-1);
+ }
innodb_table = indexed_table
= dict_table_get(prebuilt->table->name, FALSE);
if (UNIV_UNLIKELY(!innodb_table)) {
- error = HA_ERR_NO_SUCH_TABLE;
- goto err_exit;
+ DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
}
- /* Check if the index name is reserved. */
- if (innobase_index_name_is_reserved(trx, key_info, num_of_keys)) {
- error = ER_WRONG_NAME_FOR_INDEX;
- } else {
- /* Check that index keys are sensible */
- error = innobase_check_index_keys(key_info, num_of_keys,
- innodb_table);
- }
+ /* Check that index keys are sensible */
+ error = innobase_check_index_keys(key_info, num_of_keys, innodb_table);
if (UNIV_UNLIKELY(error)) {
-err_exit:
+ DBUG_RETURN(error);
+ }
+
+ heap = mem_heap_create(1024);
+ trx_start_if_not_started(prebuilt->trx);
+
+ /* Create a background transaction for the operations on
+ the data dictionary tables. */
+ trx = innobase_trx_allocate(user_thd);
+ if (trx->fake_changes) {
mem_heap_free(heap);
trx_general_rollback_for_mysql(trx, NULL);
trx_free_for_mysql(trx);
- trx_commit_for_mysql(prebuilt->trx);
- DBUG_RETURN(error);
+ DBUG_RETURN(HA_ERR_WRONG_COMMAND);
}
+ trx_start_if_not_started(trx);
+
/* Create table containing all indexes to be built in this
alter table add index so that they are in the correct order
in the table. */
@@ -758,8 +761,12 @@ err_exit:
ut_d(dict_table_check_for_dup_indexes(innodb_table,
FALSE));
+ mem_heap_free(heap);
+ trx_general_rollback_for_mysql(trx, NULL);
row_mysql_unlock_data_dictionary(trx);
- goto err_exit;
+ trx_free_for_mysql(trx);
+ trx_commit_for_mysql(prebuilt->trx);
+ DBUG_RETURN(error);
}
trx->table_id = indexed_table->id;
@@ -782,10 +789,6 @@ err_exit:
ut_ad(error == DB_SUCCESS);
- /* We will need to rebuild index translation table. Set
- valid index entry count in the translation table to zero */
- share->idx_trans_tbl.index_count = 0;
-
/* Commit the data dictionary transaction in order to release
the table locks on the system tables. This means that if
MySQL crashes while creating a new primary key inside
@@ -911,6 +914,14 @@ error:
}
convert_error:
+ if (error == DB_SUCCESS) {
+ /* Build index is successful. We will need to
+ rebuild index translation table. Reset the
+ index entry count in the translation table
+ to zero, so that translation table will be rebuilt */
+ share->idx_trans_tbl.index_count = 0;
+ }
+
error = convert_error_code_to_mysql(error,
innodb_table->flags,
user_thd);
@@ -963,6 +974,10 @@ ha_innobase::prepare_drop_index(
trx_search_latch_release_if_reserved(prebuilt->trx);
trx = prebuilt->trx;
+ if (trx->fake_changes) {
+ DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+ }
+
/* Test and mark all the indexes to be dropped */
row_mysql_lock_data_dictionary(trx);
@@ -1167,6 +1182,12 @@ ha_innobase::final_drop_index(
/* Create a background transaction for the operations on
the data dictionary tables. */
trx = innobase_trx_allocate(user_thd);
+ if (trx->fake_changes) {
+ trx_general_rollback_for_mysql(trx, NULL);
+ trx_free_for_mysql(trx);
+ DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+ }
+
trx_start_if_not_started(trx);
/* Flag this transaction as a dictionary operation, so that
diff --git a/storage/xtradb/handler/i_s.cc b/storage/xtradb/handler/i_s.cc
index d989ce87aa3..15c09d69830 100644
--- a/storage/xtradb/handler/i_s.cc
+++ b/storage/xtradb/handler/i_s.cc
@@ -1006,7 +1006,7 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_pool_pages =
/* plugin author (for SHOW PLUGINS) */
/* const char* */
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
/* general descriptive text (for SHOW PLUGINS) */
/* const char* */
@@ -1055,7 +1055,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_pool_pages_maria =
/* plugin author (for SHOW PLUGINS) */
/* const char* */
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
/* general descriptive text (for SHOW PLUGINS) */
/* const char* */
@@ -1108,7 +1108,7 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_pool_pages_index =
/* plugin author (for SHOW PLUGINS) */
/* const char* */
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
/* general descriptive text (for SHOW PLUGINS) */
/* const char* */
@@ -1157,7 +1157,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_pool_pages_index_maria =
/* plugin author (for SHOW PLUGINS) */
/* const char* */
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
/* general descriptive text (for SHOW PLUGINS) */
/* const char* */
@@ -1210,7 +1210,7 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_pool_pages_blob =
/* plugin author (for SHOW PLUGINS) */
/* const char* */
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
/* general descriptive text (for SHOW PLUGINS) */
/* const char* */
@@ -1259,7 +1259,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_pool_pages_blob_maria =
/* plugin author (for SHOW PLUGINS) */
/* const char* */
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
/* general descriptive text (for SHOW PLUGINS) */
/* const char* */
@@ -3158,7 +3158,7 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_rseg =
/* plugin author (for SHOW PLUGINS) */
/* const char* */
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
/* general descriptive text (for SHOW PLUGINS) */
/* const char* */
@@ -3207,7 +3207,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_rseg_maria =
/* plugin author (for SHOW PLUGINS) */
/* const char* */
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
/* general descriptive text (for SHOW PLUGINS) */
/* const char* */
@@ -3246,6 +3246,189 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_rseg_maria =
/***********************************************************************
*/
+static ST_FIELD_INFO i_s_innodb_admin_command_info[] =
+{
+ {STRUCT_FLD(field_name, "result_message"),
+ STRUCT_FLD(field_length, 1024),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+#ifndef INNODB_COMPATIBILITY_HOOKS
+#error InnoDB needs MySQL to be built with #define INNODB_COMPATIBILITY_HOOKS
+#endif
+
+extern "C" {
+char **thd_query(MYSQL_THD thd);
+}
+
+static
+int
+i_s_innodb_admin_command_fill(
+/*==========================*/
+ THD* thd,
+ TABLE_LIST* tables,
+ COND* cond)
+{
+ TABLE* i_s_table = (TABLE *) tables->table;
+ char** query_str;
+ char* ptr;
+ char quote = '\0';
+ const char* command_head = "XTRA_";
+
+ DBUG_ENTER("i_s_innodb_admin_command_fill");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ if(thd_sql_command(thd) != SQLCOM_SELECT) {
+ field_store_string(i_s_table->field[0],
+ "SELECT command is only accepted.");
+ goto end_func;
+ }
+
+ query_str = thd_query(thd);
+ ptr = *query_str;
+
+ for (; *ptr; ptr++) {
+ if (*ptr == quote) {
+ quote = '\0';
+ } else if (quote) {
+ } else if (*ptr == '`' || *ptr == '"') {
+ quote = *ptr;
+ } else {
+ long i;
+ for (i = 0; command_head[i]; i++) {
+ if (toupper((int)(unsigned char)(ptr[i]))
+ != toupper((int)(unsigned char)
+ (command_head[i]))) {
+ goto nomatch;
+ }
+ }
+ break;
+nomatch:
+ ;
+ }
+ }
+
+ if (!*ptr) {
+ field_store_string(i_s_table->field[0],
+ "No XTRA_* command in the SQL statement."
+ " Please add /*!XTRA_xxxx*/ to the SQL.");
+ goto end_func;
+ }
+
+ if (!strncasecmp("XTRA_HELLO", ptr, 10)) {
+ /* This is example command XTRA_HELLO */
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: administration command test for XtraDB"
+ " 'XTRA_HELLO' was detected.\n");
+
+ field_store_string(i_s_table->field[0],
+ "Hello!");
+ goto end_func;
+ }
+ else if (!strncasecmp("XTRA_LRU_DUMP", ptr, 13)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: administration command 'XTRA_LRU_DUMP'"
+ " was detected.\n");
+
+ if (buf_LRU_file_dump()) {
+ field_store_string(i_s_table->field[0],
+ "XTRA_LRU_DUMP was succeeded.");
+ } else {
+ field_store_string(i_s_table->field[0],
+ "XTRA_LRU_DUMP was failed.");
+ }
+
+ goto end_func;
+ }
+ else if (!strncasecmp("XTRA_LRU_RESTORE", ptr, 16)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: administration command 'XTRA_LRU_RESTORE'"
+ " was detected.\n");
+
+ if (buf_LRU_file_restore()) {
+ field_store_string(i_s_table->field[0],
+ "XTRA_LRU_RESTORE was succeeded.");
+ } else {
+ field_store_string(i_s_table->field[0],
+ "XTRA_LRU_RESTORE was failed.");
+ }
+
+ goto end_func;
+ }
+
+ field_store_string(i_s_table->field[0],
+ "Undefined XTRA_* command.");
+ goto end_func;
+
+end_func:
+ if (schema_table_store_record(thd, i_s_table)) {
+ DBUG_RETURN(1);
+ } else {
+ DBUG_RETURN(0);
+ }
+}
+
+static
+int
+i_s_innodb_admin_command_init(
+/*==========================*/
+ void* p)
+{
+ DBUG_ENTER("i_s_innodb_admin_command_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_innodb_admin_command_info;
+ schema->fill_table = i_s_innodb_admin_command_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_admin_command =
+{
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+ STRUCT_FLD(info, &i_s_info),
+ STRUCT_FLD(name, "XTRADB_ADMIN_COMMAND"),
+ STRUCT_FLD(author, "Percona"),
+ STRUCT_FLD(descr, "XtraDB specific command acceptor"),
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+ STRUCT_FLD(init, i_s_innodb_admin_command_init),
+ STRUCT_FLD(deinit, i_s_common_deinit),
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+ STRUCT_FLD(status_vars, NULL),
+ STRUCT_FLD(system_vars, NULL),
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_admin_command_maria =
+{
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+ STRUCT_FLD(info, &i_s_info),
+ STRUCT_FLD(name, "XTRADB_ADMIN_COMMAND"),
+ STRUCT_FLD(author, "Percona"),
+ STRUCT_FLD(descr, "XtraDB specific command acceptor"),
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+ STRUCT_FLD(init, i_s_innodb_admin_command_init),
+ STRUCT_FLD(deinit, i_s_common_deinit),
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+ STRUCT_FLD(status_vars, NULL),
+ STRUCT_FLD(system_vars, NULL),
+ STRUCT_FLD(version_info, "1.0"),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/***********************************************************************
+*/
static ST_FIELD_INFO i_s_innodb_table_stats_info[] =
{
{STRUCT_FLD(field_name, "table_schema"),
@@ -3561,7 +3744,7 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_table_stats =
STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
STRUCT_FLD(info, &i_s_info),
STRUCT_FLD(name, "INNODB_TABLE_STATS"),
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
STRUCT_FLD(descr, "InnoDB table statistics in memory"),
STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
STRUCT_FLD(init, i_s_innodb_table_stats_init),
@@ -3577,7 +3760,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_table_stats_maria =
STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
STRUCT_FLD(info, &i_s_info),
STRUCT_FLD(name, "INNODB_TABLE_STATS"),
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
STRUCT_FLD(descr, "InnoDB table statistics in memory"),
STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
STRUCT_FLD(init, i_s_innodb_table_stats_init),
@@ -3594,7 +3777,7 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_index_stats =
STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
STRUCT_FLD(info, &i_s_info),
STRUCT_FLD(name, "INNODB_INDEX_STATS"),
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
STRUCT_FLD(descr, "InnoDB index statistics in memory"),
STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
STRUCT_FLD(init, i_s_innodb_index_stats_init),
@@ -3610,7 +3793,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_index_stats_maria =
STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
STRUCT_FLD(info, &i_s_info),
STRUCT_FLD(name, "INNODB_INDEX_STATS"),
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
STRUCT_FLD(descr, "InnoDB index statistics in memory"),
STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
STRUCT_FLD(init, i_s_innodb_index_stats_init),
@@ -3622,188 +3805,6 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_index_stats_maria =
STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
};
-/***********************************************************************
-*/
-static ST_FIELD_INFO i_s_innodb_admin_command_info[] =
-{
- {STRUCT_FLD(field_name, "result_message"),
- STRUCT_FLD(field_length, 1024),
- STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
- STRUCT_FLD(value, 0),
- STRUCT_FLD(field_flags, 0),
- STRUCT_FLD(old_name, ""),
- STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
-
- END_OF_ST_FIELD_INFO
-};
-
-#ifndef INNODB_COMPATIBILITY_HOOKS
-#error InnoDB needs MySQL to be built with #define INNODB_COMPATIBILITY_HOOKS
-#endif
-
-extern "C" {
-char **thd_query(MYSQL_THD thd);
-}
-
-static
-int
-i_s_innodb_admin_command_fill(
-/*==========================*/
- THD* thd,
- TABLE_LIST* tables,
- COND* cond)
-{
- TABLE* i_s_table = (TABLE *) tables->table;
- char** query_str;
- char* ptr;
- char quote = '\0';
- const char* command_head = "XTRA_";
-
- DBUG_ENTER("i_s_innodb_admin_command_fill");
-
- /* deny access to non-superusers */
- if (check_global_access(thd, PROCESS_ACL)) {
- DBUG_RETURN(0);
- }
-
- if(thd_sql_command(thd) != SQLCOM_SELECT) {
- field_store_string(i_s_table->field[0],
- "SELECT command is only accepted.");
- goto end_func;
- }
-
- query_str = thd_query(thd);
- ptr = *query_str;
-
- for (; *ptr; ptr++) {
- if (*ptr == quote) {
- quote = '\0';
- } else if (quote) {
- } else if (*ptr == '`' || *ptr == '"') {
- quote = *ptr;
- } else {
- long i;
- for (i = 0; command_head[i]; i++) {
- if (toupper((int)(unsigned char)(ptr[i]))
- != toupper((int)(unsigned char)
- (command_head[i]))) {
- goto nomatch;
- }
- }
- break;
-nomatch:
- ;
- }
- }
-
- if (!*ptr) {
- field_store_string(i_s_table->field[0],
- "No XTRA_* command in the SQL statement."
- " Please add /*!XTRA_xxxx*/ to the SQL.");
- goto end_func;
- }
-
- if (!strncasecmp("XTRA_HELLO", ptr, 10)) {
- /* This is example command XTRA_HELLO */
-
- ut_print_timestamp(stderr);
- fprintf(stderr, " InnoDB: administration command test for XtraDB"
- " 'XTRA_HELLO' was detected.\n");
-
- field_store_string(i_s_table->field[0],
- "Hello!");
- goto end_func;
- }
- else if (!strncasecmp("XTRA_LRU_DUMP", ptr, 13)) {
- ut_print_timestamp(stderr);
- fprintf(stderr, " InnoDB: administration command 'XTRA_LRU_DUMP'"
- " was detected.\n");
-
- if (buf_LRU_file_dump()) {
- field_store_string(i_s_table->field[0],
- "XTRA_LRU_DUMP was succeeded.");
- } else {
- field_store_string(i_s_table->field[0],
- "XTRA_LRU_DUMP was failed.");
- }
-
- goto end_func;
- }
- else if (!strncasecmp("XTRA_LRU_RESTORE", ptr, 16)) {
- ut_print_timestamp(stderr);
- fprintf(stderr, " InnoDB: administration command 'XTRA_LRU_RESTORE'"
- " was detected.\n");
-
- if (buf_LRU_file_restore()) {
- field_store_string(i_s_table->field[0],
- "XTRA_LRU_RESTORE was succeeded.");
- } else {
- field_store_string(i_s_table->field[0],
- "XTRA_LRU_RESTORE was failed.");
- }
-
- goto end_func;
- }
-
- field_store_string(i_s_table->field[0],
- "Undefined XTRA_* command.");
- goto end_func;
-
-end_func:
- if (schema_table_store_record(thd, i_s_table)) {
- DBUG_RETURN(1);
- } else {
- DBUG_RETURN(0);
- }
-}
-
-static
-int
-i_s_innodb_admin_command_init(
-/*==========================*/
- void* p)
-{
- DBUG_ENTER("i_s_innodb_admin_command_init");
- ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
-
- schema->fields_info = i_s_innodb_admin_command_info;
- schema->fill_table = i_s_innodb_admin_command_fill;
-
- DBUG_RETURN(0);
-}
-
-UNIV_INTERN struct st_mysql_plugin i_s_innodb_admin_command =
-{
- STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
- STRUCT_FLD(info, &i_s_info),
- STRUCT_FLD(name, "XTRADB_ADMIN_COMMAND"),
- STRUCT_FLD(author, plugin_author),
- STRUCT_FLD(descr, "XtraDB specific command acceptor"),
- STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
- STRUCT_FLD(init, i_s_innodb_admin_command_init),
- STRUCT_FLD(deinit, i_s_common_deinit),
- STRUCT_FLD(version, 0x0100 /* 1.0 */),
- STRUCT_FLD(status_vars, NULL),
- STRUCT_FLD(system_vars, NULL),
- STRUCT_FLD(__reserved1, NULL)
-};
-
-UNIV_INTERN struct st_maria_plugin i_s_innodb_admin_command_maria =
-{
- STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
- STRUCT_FLD(info, &i_s_info),
- STRUCT_FLD(name, "XTRADB_ADMIN_COMMAND"),
- STRUCT_FLD(author, plugin_author),
- STRUCT_FLD(descr, "XtraDB specific command acceptor"),
- STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
- STRUCT_FLD(init, i_s_innodb_admin_command_init),
- STRUCT_FLD(deinit, i_s_common_deinit),
- STRUCT_FLD(version, 0x0100 /* 1.0 */),
- STRUCT_FLD(status_vars, NULL),
- STRUCT_FLD(system_vars, NULL),
- STRUCT_FLD(version_info, "1.0"),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
-};
static ST_FIELD_INFO i_s_innodb_sys_tables_info[] =
{
@@ -4340,15 +4341,14 @@ i_s_innodb_schema_table_fill(
rec = btr_pcur_get_rec(&pcur);
if (!btr_pcur_is_on_user_rec(&pcur)) {
/* end of index */
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
break;
}
+
+ btr_pcur_store_position(&pcur, &mtr);
+
if (rec_get_deleted_flag(rec, 0)) {
/* record marked as deleted */
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
- continue;
+ goto next_record;
}
if (id == 0) {
@@ -4359,33 +4359,23 @@ i_s_innodb_schema_table_fill(
status = copy_sys_stats_rec(table, index, rec);
}
if (status) {
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
break;
}
-#if 0
- btr_pcur_store_position(&pcur, &mtr);
- mtr_commit(&mtr);
-
status = schema_table_store_record(thd, table);
if (status) {
- btr_pcur_close(&pcur);
break;
}
+next_record:
+ mtr_commit(&mtr);
mtr_start(&mtr);
btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
-#else
- status = schema_table_store_record(thd, table);
- if (status) {
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
- break;
- }
-#endif
}
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
mutex_exit(&(dict_sys->mutex));
DBUG_RETURN(status);
@@ -4441,7 +4431,7 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_tables =
STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
STRUCT_FLD(info, &i_s_info),
STRUCT_FLD(name, "INNODB_SYS_TABLES"),
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
STRUCT_FLD(descr, "InnoDB SYS_TABLES table"),
STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
STRUCT_FLD(init, i_s_innodb_sys_tables_init),
@@ -4457,7 +4447,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_tables_maria =
STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
STRUCT_FLD(info, &i_s_info),
STRUCT_FLD(name, "INNODB_SYS_TABLES"),
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
STRUCT_FLD(descr, "InnoDB SYS_TABLES table"),
STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
STRUCT_FLD(init, i_s_innodb_sys_tables_init),
@@ -4474,7 +4464,7 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_indexes =
STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
STRUCT_FLD(info, &i_s_info),
STRUCT_FLD(name, "INNODB_SYS_INDEXES"),
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
STRUCT_FLD(descr, "InnoDB SYS_INDEXES table"),
STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
STRUCT_FLD(init, i_s_innodb_sys_indexes_init),
@@ -4490,7 +4480,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_indexes_maria =
STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
STRUCT_FLD(info, &i_s_info),
STRUCT_FLD(name, "INNODB_SYS_INDEXES"),
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
STRUCT_FLD(descr, "InnoDB SYS_INDEXES table"),
STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
STRUCT_FLD(init, i_s_innodb_sys_indexes_init),
@@ -4507,7 +4497,7 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_stats =
STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
STRUCT_FLD(info, &i_s_info),
STRUCT_FLD(name, "INNODB_SYS_STATS"),
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
STRUCT_FLD(descr, "InnoDB SYS_STATS table"),
STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
STRUCT_FLD(init, i_s_innodb_sys_stats_init),
@@ -4523,7 +4513,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_stats_maria =
STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
STRUCT_FLD(info, &i_s_info),
STRUCT_FLD(name, "INNODB_SYS_STATS"),
- STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(author, "Percona"),
STRUCT_FLD(descr, "InnoDB SYS_STATS table"),
STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
STRUCT_FLD(init, i_s_innodb_sys_stats_init),
diff --git a/storage/xtradb/handler/innodb_patch_info.h b/storage/xtradb/handler/innodb_patch_info.h
index e68f12d0fec..38b97411340 100644
--- a/storage/xtradb/handler/innodb_patch_info.h
+++ b/storage/xtradb/handler/innodb_patch_info.h
@@ -47,6 +47,5 @@ struct innodb_enhancement {
{"innodb_fast_checksum","Using the checksum on 32bit-unit calculation","incompatible for unpatched ver.","http://www.percona.com/docs/wiki/percona-xtradb"},
{"innodb_files_extend","allow >4GB transaction log files, and can vary universal page size of datafiles","incompatible for unpatched ver.","http://www.percona.com/docs/wiki/percona-xtradb"},
{"innodb_sys_tables_sys_indexes","Expose InnoDB SYS_TABLES and SYS_INDEXES schema tables","","http://www.percona.com/docs/wiki/percona-xtradb"},
-{"innodb_buffer_pool_shm","Put buffer pool contents to shared memory segment and reuse it at clean restart [experimental]","","http://www.percona.com/docs/wiki/percona-xtradb"},
{NULL, NULL, NULL, NULL}
};
diff --git a/storage/xtradb/ibuf/ibuf0ibuf.c b/storage/xtradb/ibuf/ibuf0ibuf.c
index 3f741da60bb..64dc9a5591d 100644
--- a/storage/xtradb/ibuf/ibuf0ibuf.c
+++ b/storage/xtradb/ibuf/ibuf0ibuf.c
@@ -2613,6 +2613,8 @@ ibuf_insert_low(
ut_a(trx_sys_multiple_tablespace_format);
+ ut_ad(!(thr_get_trx(thr)->fake_changes));
+
do_merge = FALSE;
mutex_enter(&ibuf_mutex);
diff --git a/storage/xtradb/include/btr0btr.h b/storage/xtradb/include/btr0btr.h
index dde3a0bab69..1fe40965c0f 100644
--- a/storage/xtradb/include/btr0btr.h
+++ b/storage/xtradb/include/btr0btr.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -81,6 +81,91 @@ UNIQUE definition on secondary indexes when we decide if we can use
the insert buffer to speed up inserts */
#define BTR_IGNORE_SEC_UNIQUE 2048
+#ifdef UNIV_BLOB_DEBUG
+# include "ut0rbt.h"
+/** An index->blobs entry for keeping track of off-page column references */
+struct btr_blob_dbg_struct
+{
+ unsigned blob_page_no:32; /*!< first BLOB page number */
+ unsigned ref_page_no:32; /*!< referring page number */
+ unsigned ref_heap_no:16; /*!< referring heap number */
+ unsigned ref_field_no:10; /*!< referring field number */
+ unsigned owner:1; /*!< TRUE if BLOB owner */
+ unsigned always_owner:1; /*!< TRUE if always
+ has been the BLOB owner;
+ reset to TRUE on B-tree
+ page splits and merges */
+ unsigned del:1; /*!< TRUE if currently
+ delete-marked */
+};
+
+/**************************************************************//**
+Add a reference to an off-page column to the index->blobs map. */
+UNIV_INTERN
+void
+btr_blob_dbg_add_blob(
+/*==================*/
+ const rec_t* rec, /*!< in: clustered index record */
+ ulint field_no, /*!< in: number of off-page column */
+ ulint page_no, /*!< in: start page of the column */
+ dict_index_t* index, /*!< in/out: index tree */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Display the references to off-page columns.
+This function is to be called from a debugger,
+for example when a breakpoint on ut_dbg_assertion_failed is hit. */
+UNIV_INTERN
+void
+btr_blob_dbg_print(
+/*===============*/
+ const dict_index_t* index) /*!< in: index tree */
+ __attribute__((nonnull));
+/**************************************************************//**
+Check that there are no references to off-page columns from or to
+the given page. Invoked when freeing or clearing a page.
+@return TRUE when no orphan references exist */
+UNIV_INTERN
+ibool
+btr_blob_dbg_is_empty(
+/*==================*/
+ dict_index_t* index, /*!< in: index */
+ ulint page_no) /*!< in: page number */
+ __attribute__((nonnull, warn_unused_result));
+
+/**************************************************************//**
+Modify the 'deleted' flag of a record. */
+UNIV_INTERN
+void
+btr_blob_dbg_set_deleted_flag(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: rec_get_offs(rec, index) */
+ ibool del) /*!< in: TRUE=deleted, FALSE=exists */
+ __attribute__((nonnull));
+/**************************************************************//**
+Change the ownership of an off-page column. */
+UNIV_INTERN
+void
+btr_blob_dbg_owner(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: rec_get_offs(rec, index) */
+ ulint i, /*!< in: ith field in rec */
+ ibool own) /*!< in: TRUE=owned, FALSE=disowned */
+ __attribute__((nonnull));
+/** Assert that there are no BLOB references to or from the given page. */
+# define btr_blob_dbg_assert_empty(index, page_no) \
+ ut_a(btr_blob_dbg_is_empty(index, page_no))
+#else /* UNIV_BLOB_DEBUG */
+# define btr_blob_dbg_add_blob(rec, field_no, page, index, ctx) ((void) 0)
+# define btr_blob_dbg_set_deleted_flag(rec, index, offsets, del)((void) 0)
+# define btr_blob_dbg_owner(rec, index, offsets, i, val) ((void) 0)
+# define btr_blob_dbg_assert_empty(index, page_no) ((void) 0)
+#endif /* UNIV_BLOB_DEBUG */
+
/**************************************************************//**
Gets the root node of a tree and x-latches it.
@return root page, x-latched */
@@ -123,6 +208,17 @@ btr_block_get_func(
@return the uncompressed page frame */
# define btr_page_get(space,zip_size,page_no,mode,mtr) \
buf_block_get_frame(btr_block_get(space,zip_size,page_no,mode,mtr))
+/**************************************************************//**
+Sets the index id field of a page. */
+UNIV_INLINE
+void
+btr_page_set_index_id(
+/*==================*/
+ page_t* page, /*!< in: page to be created */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ dulint id, /*!< in: index id */
+ mtr_t* mtr); /*!< in: mtr */
#endif /* !UNIV_HOTBACKUP */
/**************************************************************//**
Gets the index id field of a page.
@@ -160,6 +256,17 @@ btr_page_get_next(
const page_t* page, /*!< in: index page */
mtr_t* mtr); /*!< in: mini-transaction handle */
/********************************************************//**
+Sets the next index page field. */
+UNIV_INLINE
+void
+btr_page_set_next(
+/*==============*/
+ page_t* page, /*!< in: index page */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ ulint next, /*!< in: next page number */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
Gets the previous index page number.
@return prev page number */
UNIV_INLINE
@@ -168,6 +275,17 @@ btr_page_get_prev(
/*==============*/
const page_t* page, /*!< in: index page */
mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Sets the previous index page field. */
+UNIV_INLINE
+void
+btr_page_set_prev(
+/*==============*/
+ page_t* page, /*!< in: index page */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ ulint prev, /*!< in: previous page number */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
/*************************************************************//**
Gets pointer to the previous user record in the tree. It is assumed
that the caller has appropriate latches on the page and its neighbor.
@@ -213,6 +331,18 @@ btr_node_ptr_get_child_page_no(
/*===========================*/
const rec_t* rec, /*!< in: node pointer record */
const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/**************************************************************//**
+Creates a new index page (not the root, and also not
+used in page reorganization). @see btr_page_empty(). */
+UNIV_INTERN
+void
+btr_page_create(
+/*============*/
+ buf_block_t* block, /*!< in/out: page to be created */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: the B-tree level of the page */
+ mtr_t* mtr); /*!< in: mtr */
/************************************************************//**
Creates the root node for a new index tree.
@return page number of the created root, FIL_NULL if did not succeed */
@@ -283,6 +413,17 @@ btr_page_reorganize(
dict_index_t* index, /*!< in: record descriptor */
mtr_t* mtr); /*!< in: mtr */
/*************************************************************//**
+Empties an index page. @see btr_page_create(). */
+UNIV_INTERN
+void
+btr_page_empty(
+/*===========*/
+ buf_block_t* block, /*!< in: page to be emptied */
+ page_zip_des_t* page_zip,/*!< out: compressed page, or NULL */
+ dict_index_t* index, /*!< in: index of the page */
+ ulint level, /*!< in: the B-tree level of the page */
+ mtr_t* mtr); /*!< in: mtr */
+/*************************************************************//**
Decides if the page should be split at the convergence point of
inserts converging to left.
@return TRUE if split recommended */
@@ -341,6 +482,20 @@ btr_insert_on_non_leaf_level_func(
# define btr_insert_on_non_leaf_level(i,l,t,m) \
btr_insert_on_non_leaf_level_func(i,l,t,__FILE__,__LINE__,m)
#endif /* !UNIV_HOTBACKUP */
+/**************************************************************//**
+Attaches the halves of an index page on the appropriate level in an
+index tree. */
+UNIV_INTERN
+void
+btr_attach_half_pages(
+/*==================*/
+ dict_index_t* index, /*!< in: the index tree */
+ buf_block_t* block, /*!< in/out: page to be split */
+ const rec_t* split_rec, /*!< in: first record on upper
+ half page */
+ buf_block_t* new_block, /*!< in/out: the new half page */
+ ulint direction, /*!< in: FSP_UP or FSP_DOWN */
+ mtr_t* mtr); /*!< in: mtr */
/****************************************************************//**
Sets a record as the predefined minimum record. */
UNIV_INTERN
@@ -385,11 +540,14 @@ UNIV_INTERN
ibool
btr_compress(
/*=========*/
- btr_cur_t* cursor, /*!< in: cursor on the page to merge or lift;
- the page must not be empty: in record delete
- use btr_discard_page if the page would become
- empty */
- mtr_t* mtr); /*!< in: mtr */
+ btr_cur_t* cursor, /*!< in/out: cursor on the page to merge
+ or lift; the page must not be empty:
+ when deleting records, use btr_discard_page()
+ if the page would become empty */
+ ibool adjust, /*!< in: TRUE if should adjust the
+ cursor position even if compression occurs */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
/*************************************************************//**
Discards a page from a B-tree. This is used to remove the last record from
a B-tree page: the whole page must be removed at the same time. This cannot
diff --git a/storage/xtradb/include/btr0cur.h b/storage/xtradb/include/btr0cur.h
index ece3621fa97..6f4ce95d72f 100644
--- a/storage/xtradb/include/btr0cur.h
+++ b/storage/xtradb/include/btr0cur.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -36,6 +36,9 @@ Created 10/16/1994 Heikki Tuuri
#define BTR_NO_LOCKING_FLAG 2 /* do no record lock checking */
#define BTR_KEEP_SYS_FLAG 4 /* sys fields will be found from the
update vector or inserted entry */
+#define BTR_KEEP_POS_FLAG 8 /* btr_cur_pessimistic_update()
+ must keep cursor position when
+ moving columns to big_rec */
#ifndef UNIV_HOTBACKUP
#include "que0types.h"
@@ -309,7 +312,9 @@ btr_cur_pessimistic_update(
/*=======================*/
ulint flags, /*!< in: undo logging, locking, and rollback
flags */
- btr_cur_t* cursor, /*!< in: cursor on the record to update */
+ btr_cur_t* cursor, /*!< in/out: cursor on the record to update;
+ cursor may become invalid if *big_rec == NULL
+ || !(flags & BTR_KEEP_POS_FLAG) */
mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
be stored externally by the caller, or NULL */
@@ -321,6 +326,16 @@ btr_cur_pessimistic_update(
que_thr_t* thr, /*!< in: query thread */
mtr_t* mtr); /*!< in: mtr; must be committed before
latching any further pages */
+/*****************************************************************
+Commits and restarts a mini-transaction so that it will retain an
+x-lock on index->lock and the cursor page. */
+UNIV_INTERN
+void
+btr_cur_mtr_commit_and_start(
+/*=========================*/
+ btr_cur_t* cursor, /*!< in: cursor */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
/***********************************************************//**
Marks a clustered index record deleted. Writes an undo log record to
undo log on this delete marking. Writes in the trx id field the id
@@ -376,10 +391,13 @@ UNIV_INTERN
ibool
btr_cur_compress_if_useful(
/*=======================*/
- btr_cur_t* cursor, /*!< in: cursor on the page to compress;
+ btr_cur_t* cursor, /*!< in/out: cursor on the page to compress;
cursor does not stay valid if compression
occurs */
- mtr_t* mtr); /*!< in: mtr */
+ ibool adjust, /*!< in: TRUE if should adjust the
+ cursor position even if compression occurs */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
/*******************************************************//**
Removes the record on which the tree cursor is positioned. It is assumed
that the mtr has an x-latch on the page where the cursor is positioned,
@@ -652,6 +670,11 @@ struct btr_path_struct{
order); value ULINT_UNDEFINED
denotes array end */
ulint n_recs; /*!< number of records on the page */
+ ulint page_no; /*!< no of the page containing the record */
+ ulint page_level; /*!< level of the page, if later we fetch
+ the page under page_no and it is no different
+ level then we know that the tree has been
+ reorganized */
};
#define BTR_PATH_ARRAY_N_SLOTS 250 /*!< size of path array (in slots) */
diff --git a/storage/xtradb/include/btr0cur.ic b/storage/xtradb/include/btr0cur.ic
index 280583f6ccf..c833b3e8572 100644
--- a/storage/xtradb/include/btr0cur.ic
+++ b/storage/xtradb/include/btr0cur.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -139,7 +139,7 @@ btr_cur_compress_recommendation(
btr_cur_t* cursor, /*!< in: btr cursor */
mtr_t* mtr) /*!< in: mtr */
{
- page_t* page;
+ const page_t* page;
ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
MTR_MEMO_PAGE_X_FIX));
diff --git a/storage/xtradb/include/btr0pcur.h b/storage/xtradb/include/btr0pcur.h
index 2334a266280..f59514d04b3 100644
--- a/storage/xtradb/include/btr0pcur.h
+++ b/storage/xtradb/include/btr0pcur.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -244,18 +244,6 @@ btr_pcur_restore_position_func(
mtr_t* mtr); /*!< in: mtr */
#define btr_pcur_restore_position(l,cur,mtr) \
btr_pcur_restore_position_func(l,cur,__FILE__,__LINE__,mtr)
-/**************************************************************//**
-If the latch mode of the cursor is BTR_LEAF_SEARCH or BTR_LEAF_MODIFY,
-releases the page latch and bufferfix reserved by the cursor.
-NOTE! In the case of BTR_LEAF_MODIFY, there should not exist changes
-made by the current mini-transaction to the data protected by the
-cursor latch, as then the latch must not be released until mtr_commit. */
-UNIV_INTERN
-void
-btr_pcur_release_leaf(
-/*==================*/
- btr_pcur_t* cursor, /*!< in: persistent cursor */
- mtr_t* mtr); /*!< in: mtr */
/*********************************************************//**
Gets the rel_pos field for a cursor whose position has been stored.
@return BTR_PCUR_ON, ... */
@@ -282,10 +270,9 @@ btr_pcur_get_mtr(
btr_pcur_t* cursor); /*!< in: persistent cursor */
/**************************************************************//**
Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
-that is, the cursor becomes detached. If there have been modifications
-to the page where pcur is positioned, this can be used instead of
-btr_pcur_release_leaf. Function btr_pcur_store_position should be used
-before calling this, if restoration of cursor is wanted later. */
+that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used before calling this,
+if restoration of cursor is wanted later. */
UNIV_INLINE
void
btr_pcur_commit_specify_mtr(
diff --git a/storage/xtradb/include/btr0pcur.ic b/storage/xtradb/include/btr0pcur.ic
index 0c38797e6c5..0f9b969e7c5 100644
--- a/storage/xtradb/include/btr0pcur.ic
+++ b/storage/xtradb/include/btr0pcur.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -396,10 +396,9 @@ btr_pcur_move_to_next(
/**************************************************************//**
Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
-that is, the cursor becomes detached. If there have been modifications
-to the page where pcur is positioned, this can be used instead of
-btr_pcur_release_leaf. Function btr_pcur_store_position should be used
-before calling this, if restoration of cursor is wanted later. */
+that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used before calling this,
+if restoration of cursor is wanted later. */
UNIV_INLINE
void
btr_pcur_commit_specify_mtr(
diff --git a/storage/xtradb/include/btr0types.h b/storage/xtradb/include/btr0types.h
index ef4a6b04b34..07c06fb18d7 100644
--- a/storage/xtradb/include/btr0types.h
+++ b/storage/xtradb/include/btr0types.h
@@ -38,6 +38,131 @@ typedef struct btr_cur_struct btr_cur_t;
/** B-tree search information for the adaptive hash index */
typedef struct btr_search_struct btr_search_t;
+#ifdef UNIV_BLOB_DEBUG
+# include "buf0types.h"
+/** An index->blobs entry for keeping track of off-page column references */
+typedef struct btr_blob_dbg_struct btr_blob_dbg_t;
+
+/** Insert to index->blobs a reference to an off-page column.
+@param index the index tree
+@param b the reference
+@param ctx context (for logging) */
+UNIV_INTERN
+void
+btr_blob_dbg_rbt_insert(
+/*====================*/
+ dict_index_t* index, /*!< in/out: index tree */
+ const btr_blob_dbg_t* b, /*!< in: the reference */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+
+/** Remove from index->blobs a reference to an off-page column.
+@param index the index tree
+@param b the reference
+@param ctx context (for logging) */
+UNIV_INTERN
+void
+btr_blob_dbg_rbt_delete(
+/*====================*/
+ dict_index_t* index, /*!< in/out: index tree */
+ const btr_blob_dbg_t* b, /*!< in: the reference */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+
+/**************************************************************//**
+Add to index->blobs any references to off-page columns from a record.
+@return number of references added */
+UNIV_INTERN
+ulint
+btr_blob_dbg_add_rec(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: offsets */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Remove from index->blobs any references to off-page columns from a record.
+@return number of references removed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_remove_rec(
+/*====================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: offsets */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Count and add to index->blobs any references to off-page columns
+from records on a page.
+@return number of references added */
+UNIV_INTERN
+ulint
+btr_blob_dbg_add(
+/*=============*/
+ const page_t* page, /*!< in: rewritten page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Count and remove from index->blobs any references to off-page columns
+from records on a page.
+Used when reorganizing a page, before copying the records.
+@return number of references removed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_remove(
+/*================*/
+ const page_t* page, /*!< in: b-tree page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Restore in index->blobs any references to off-page columns
+Used when page reorganize fails due to compressed page overflow. */
+UNIV_INTERN
+void
+btr_blob_dbg_restore(
+/*=================*/
+ const page_t* npage, /*!< in: page that failed to compress */
+ const page_t* page, /*!< in: copy of original page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+
+/** Operation that processes the BLOB references of an index record
+@param[in] rec record on index page
+@param[in/out] index the index tree of the record
+@param[in] offsets rec_get_offsets(rec,index)
+@param[in] ctx context (for logging)
+@return number of BLOB references processed */
+typedef ulint (*btr_blob_dbg_op_f)
+(const rec_t* rec,dict_index_t* index,const ulint* offsets,const char* ctx);
+
+/**************************************************************//**
+Count and process all references to off-page columns on a page.
+@return number of references processed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_op(
+/*============*/
+ const page_t* page, /*!< in: B-tree leaf page */
+ const rec_t* rec, /*!< in: record to start from
+ (NULL to process the whole page) */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx, /*!< in: context (for logging) */
+ const btr_blob_dbg_op_f op) /*!< in: operation on records */
+ __attribute__((nonnull(1,3,4,5)));
+#else /* UNIV_BLOB_DEBUG */
+# define btr_blob_dbg_add_rec(rec, index, offsets, ctx) ((void) 0)
+# define btr_blob_dbg_add(page, index, ctx) ((void) 0)
+# define btr_blob_dbg_remove_rec(rec, index, offsets, ctx) ((void) 0)
+# define btr_blob_dbg_remove(page, index, ctx) ((void) 0)
+# define btr_blob_dbg_restore(npage, page, index, ctx) ((void) 0)
+# define btr_blob_dbg_op(page, rec, index, ctx, op) ((void) 0)
+#endif /* UNIV_BLOB_DEBUG */
+
/** The size of a reference to data stored on a different page.
The reference is stored at the end of the prefix of the field
in the index record. */
diff --git a/storage/xtradb/include/buf0buddy.h b/storage/xtradb/include/buf0buddy.h
index 3a35f8e46e9..f4c3da8692d 100644
--- a/storage/xtradb/include/buf0buddy.h
+++ b/storage/xtradb/include/buf0buddy.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -37,25 +37,20 @@ Created December 2006 by Marko Makela
/**********************************************************************//**
Allocate a block. The thread calling this function must hold
buf_pool_mutex and must not hold buf_pool_zip_mutex or any
-block->mutex. The buf_pool_mutex may only be released and reacquired
-if lru != NULL. This function should only be used for allocating
-compressed page frames or control blocks (buf_page_t). Allocated
-control blocks must be properly initialized immediately after
-buf_buddy_alloc() has returned the memory, before releasing
-buf_pool_mutex.
-@return allocated block, possibly NULL if lru == NULL */
+block->mutex. The buf_pool_mutex may be released and reacquired.
+This function should only be used for allocating compressed page frames.
+@return allocated block, never NULL */
UNIV_INLINE
void*
buf_buddy_alloc(
/*============*/
- ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */
+ ulint size, /*!< in: compressed page size
+ (between PAGE_ZIP_MIN_SIZE and UNIV_PAGE_SIZE) */
ibool* lru, /*!< in: pointer to a variable that will be assigned
TRUE if storage was allocated from the LRU list
- and buf_pool_mutex was temporarily released,
- or NULL if the LRU list should not be used */
+ and buf_pool_mutex was temporarily released */
ibool have_page_hash_mutex)
- __attribute__((malloc));
-
+ __attribute__((malloc, nonnull));
/**********************************************************************//**
Release a block. */
UNIV_INLINE
diff --git a/storage/xtradb/include/buf0buddy.ic b/storage/xtradb/include/buf0buddy.ic
index 69659fb69d6..63241311e1f 100644
--- a/storage/xtradb/include/buf0buddy.ic
+++ b/storage/xtradb/include/buf0buddy.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -36,8 +36,8 @@ Created December 2006 by Marko Makela
/**********************************************************************//**
Allocate a block. The thread calling this function must hold
buf_pool_mutex and must not hold buf_pool_zip_mutex or any block->mutex.
-The buf_pool_mutex may only be released and reacquired if lru != NULL.
-@return allocated block, possibly NULL if lru==NULL */
+The buf_pool_mutex may be released and reacquired.
+@return allocated block, never NULL */
UNIV_INTERN
void*
buf_buddy_alloc_low(
@@ -46,10 +46,9 @@ buf_buddy_alloc_low(
or BUF_BUDDY_SIZES */
ibool* lru, /*!< in: pointer to a variable that will be assigned
TRUE if storage was allocated from the LRU list
- and buf_pool_mutex was temporarily released,
- or NULL if the LRU list should not be used */
+ and buf_pool_mutex was temporarily released */
ibool have_page_hash_mutex)
- __attribute__((malloc));
+ __attribute__((malloc, nonnull));
/**********************************************************************//**
Deallocate a block. */
@@ -76,6 +75,8 @@ buf_buddy_get_slot(
ulint i;
ulint s;
+ ut_ad(size >= PAGE_ZIP_MIN_SIZE);
+
for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) {
}
@@ -86,27 +87,26 @@ buf_buddy_get_slot(
/**********************************************************************//**
Allocate a block. The thread calling this function must hold
buf_pool_mutex and must not hold buf_pool_zip_mutex or any
-block->mutex. The buf_pool_mutex may only be released and reacquired
-if lru != NULL. This function should only be used for allocating
-compressed page frames or control blocks (buf_page_t). Allocated
-control blocks must be properly initialized immediately after
-buf_buddy_alloc() has returned the memory, before releasing
-buf_pool_mutex.
-@return allocated block, possibly NULL if lru == NULL */
+block->mutex. The buf_pool_mutex may be released and reacquired.
+This function should only be used for allocating compressed page frames.
+@return allocated block, never NULL */
UNIV_INLINE
void*
buf_buddy_alloc(
/*============*/
- ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */
+ ulint size, /*!< in: compressed page size
+ (between PAGE_ZIP_MIN_SIZE and UNIV_PAGE_SIZE) */
ibool* lru, /*!< in: pointer to a variable that will be assigned
TRUE if storage was allocated from the LRU list
- and buf_pool_mutex was temporarily released,
- or NULL if the LRU list should not be used */
+ and buf_pool_mutex was temporarily released */
ibool have_page_hash_mutex)
{
//ut_ad(buf_pool_mutex_own());
+ ut_ad(ut_is_2pow(size));
+ ut_ad(size >= PAGE_ZIP_MIN_SIZE);
+ ut_ad(size <= UNIV_PAGE_SIZE);
- return(buf_buddy_alloc_low(buf_buddy_get_slot(size), lru, have_page_hash_mutex));
+ return((byte*) buf_buddy_alloc_low(buf_buddy_get_slot(size), lru, have_page_hash_mutex));
}
/**********************************************************************//**
@@ -121,6 +121,9 @@ buf_buddy_free(
ibool have_page_hash_mutex)
{
//ut_ad(buf_pool_mutex_own());
+ ut_ad(ut_is_2pow(size));
+ ut_ad(size >= PAGE_ZIP_MIN_SIZE);
+ ut_ad(size <= UNIV_PAGE_SIZE);
if (!have_page_hash_mutex) {
mutex_enter(&LRU_list_mutex);
diff --git a/storage/xtradb/include/buf0buf.h b/storage/xtradb/include/buf0buf.h
index bc0e9170281..838dd7f3900 100644
--- a/storage/xtradb/include/buf0buf.h
+++ b/storage/xtradb/include/buf0buf.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -36,12 +36,13 @@ Created 11/5/1995 Heikki Tuuri
#include "ut0rbt.h"
#ifndef UNIV_HOTBACKUP
#include "os0proc.h"
-#include "srv0srv.h"
/** @name Modes for buf_page_get_gen */
/* @{ */
#define BUF_GET 10 /*!< get always */
#define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */
+#define BUF_PEEK_IF_IN_POOL 12 /*!< get if in pool, do not make
+ the block young in the LRU list */
#define BUF_GET_NO_LATCH 14 /*!< get and bufferfix, but
set no latch; we have
separated this case, because
@@ -140,12 +141,6 @@ buf_relocate(
BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
buf_page_t* dpage) /*!< in/out: destination control block */
__attribute__((nonnull));
-/********************************************************************//**
-Resizes the buffer pool. */
-UNIV_INTERN
-void
-buf_pool_resize(void);
-/*=================*/
/*********************************************************************//**
Gets the current size of buffer buf_pool in bytes.
@return size in bytes */
@@ -162,6 +157,23 @@ ib_uint64_t
buf_pool_get_oldest_modification(void);
/*==================================*/
/********************************************************************//**
+Allocates a buf_page_t descriptor. This function must succeed. In case
+of failure we assert in this function. */
+UNIV_INLINE
+buf_page_t*
+buf_page_alloc_descriptor(void)
+/*===========================*/
+ __attribute__((malloc));
+/********************************************************************//**
+Free a buf_page_t descriptor. */
+UNIV_INLINE
+void
+buf_page_free_descriptor(
+/*=====================*/
+ buf_page_t* bpage) /*!< in: bpage descriptor to free. */
+ __attribute__((nonnull));
+
+/********************************************************************//**
Allocates a buffer block.
@return own: the allocated block, in state BUF_BLOCK_MEMORY */
UNIV_INLINE
@@ -285,7 +297,7 @@ buf_page_get_gen(
ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
buf_block_t* guess, /*!< in: guessed block or NULL */
ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
- BUF_GET_NO_LATCH */
+ BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH */
const char* file, /*!< in: file name */
ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mini-transaction */
@@ -415,6 +427,18 @@ buf_block_get_freed_page_clock(
__attribute__((pure));
/********************************************************************//**
+Tells if a block is still close enough to the MRU end of the LRU list
+meaning that it is not in danger of getting evicted and also implying
+that it has been accessed recently.
+Note that this is for heuristics only and does not reserve buffer pool
+mutex.
+@return TRUE if block is close to MRU end of LRU */
+UNIV_INLINE
+ibool
+buf_page_peek_if_young(
+/*===================*/
+ const buf_page_t* bpage); /*!< in: block */
+/********************************************************************//**
Recommends a move of a block to the start of the LRU list if there is danger
of dropping from the buffer pool. NOTE: does not reserve the buffer pool
mutex.
@@ -466,6 +490,31 @@ buf_block_get_modify_clock(
#else /* !UNIV_HOTBACKUP */
# define buf_block_modify_clock_inc(block) ((void) 0)
#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Increments the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_buf_fix_inc_func(
+/*=======================*/
+#ifdef UNIV_SYNC_DEBUG
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line */
+#endif /* UNIV_SYNC_DEBUG */
+ buf_block_t* block) /*!< in/out: block to bufferfix */
+ __attribute__((nonnull));
+#ifdef UNIV_SYNC_DEBUG
+/** Increments the bufferfix count.
+@param b in/out: block to bufferfix
+@param f in: file name where requested
+@param l in: line number where requested */
+# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b)
+#else /* UNIV_SYNC_DEBUG */
+/** Increments the bufferfix count.
+@param b in/out: block to bufferfix
+@param f in: file name where requested
+@param l in: line number where requested */
+# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b)
+#endif /* UNIV_SYNC_DEBUG */
/********************************************************************//**
Calculates a page checksum which is stored to the page when it is written
to a file. Note that we must be careful to calculate the same value
@@ -986,8 +1035,7 @@ UNIV_INTERN
void
buf_page_io_complete(
/*=================*/
- buf_page_t* bpage, /*!< in: pointer to the block in question */
- trx_t* trx);
+ buf_page_t* bpage); /*!< in: pointer to the block in question */
/********************************************************************//**
Calculates a folded value of a file page address to use in the page hash
table.
@@ -1301,10 +1349,7 @@ struct buf_block_struct{
/**********************************************************************//**
Compute the hash fold value for blocks in buf_pool->zip_hash. */
/* @{ */
-/* the fold should be relative when srv_buffer_pool_shm_key is enabled */
-#define BUF_POOL_ZIP_FOLD_PTR(ptr) (!srv_buffer_pool_shm_key\
- ?((ulint) (ptr) / UNIV_PAGE_SIZE)\
- :((ulint) ((byte*)ptr - (byte*)(buf_pool->chunks->blocks->frame)) / UNIV_PAGE_SIZE))
+#define BUF_POOL_ZIP_FOLD_PTR(ptr) ((ulint) (ptr) / UNIV_PAGE_SIZE)
#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame)
#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
/* @} */
@@ -1330,6 +1375,8 @@ struct buf_pool_stat_struct{
ulint n_pages_written;/*!< number write operations */
ulint n_pages_created;/*!< number of pages created
in the pool with no read */
+ ulint n_ra_pages_read_rnd;/*!< number of pages read in
+ as part of random read ahead */
ulint n_ra_pages_read;/*!< number of pages read in
as part of read ahead */
ulint n_ra_pages_evicted;/*!< number of read ahead
@@ -1453,8 +1500,10 @@ struct buf_pool_struct{
frames and buf_page_t descriptors of blocks that exist
in the buffer pool only in compressed form. */
/* @{ */
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
UT_LIST_BASE_NODE_T(buf_page_t) zip_clean;
/*!< unmodified compressed pages */
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
UT_LIST_BASE_NODE_T(buf_page_t) zip_free[BUF_BUDDY_SIZES_MAX];
/*!< buddy free lists */
//#if BUF_BUDDY_HIGH != UNIV_PAGE_SIZE
@@ -1486,8 +1535,8 @@ Use these instead of accessing buf_pool_mutex directly. */
/** Test if buf_pool_mutex is owned. */
#define buf_pool_mutex_own() mutex_own(&buf_pool_mutex)
/** Acquire the buffer pool mutex. */
+/* the buf_pool_mutex is changed the latch order */
#define buf_pool_mutex_enter() do { \
- ut_ad(!mutex_own(&buf_pool_zip_mutex)); \
mutex_enter(&buf_pool_mutex); \
} while (0)
diff --git a/storage/xtradb/include/buf0buf.ic b/storage/xtradb/include/buf0buf.ic
index 2cb0d8ef497..a081d6a34c0 100644
--- a/storage/xtradb/include/buf0buf.ic
+++ b/storage/xtradb/include/buf0buf.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -62,6 +62,27 @@ buf_block_get_freed_page_clock(
}
/********************************************************************//**
+Tells if a block is still close enough to the MRU end of the LRU list
+meaning that it is not in danger of getting evicted and also implying
+that it has been accessed recently.
+Note that this is for heuristics only and does not reserve buffer pool
+mutex.
+@return TRUE if block is close to MRU end of LRU */
+UNIV_INLINE
+ibool
+buf_page_peek_if_young(
+/*===================*/
+ const buf_page_t* bpage) /*!< in: block */
+{
+ /* FIXME: bpage->freed_page_clock is 31 bits */
+ return((buf_pool->freed_page_clock & ((1UL << 31) - 1))
+ < ((ulint) bpage->freed_page_clock
+ + (buf_pool->curr_size
+ * (BUF_LRU_OLD_RATIO_DIV - buf_LRU_old_ratio)
+ / (BUF_LRU_OLD_RATIO_DIV * 4))));
+}
+
+/********************************************************************//**
Recommends a move of a block to the start of the LRU list if there is danger
of dropping from the buffer pool. NOTE: does not reserve the buffer pool
mutex.
@@ -89,12 +110,7 @@ buf_page_peek_if_too_old(
buf_pool->stat.n_pages_not_made_young++;
return(FALSE);
} else {
- /* FIXME: bpage->freed_page_clock is 31 bits */
- return((buf_pool->freed_page_clock & ((1UL << 31) - 1))
- > ((ulint) bpage->freed_page_clock
- + (buf_pool->curr_size
- * (BUF_LRU_OLD_RATIO_DIV - buf_LRU_old_ratio)
- / (BUF_LRU_OLD_RATIO_DIV * 4))));
+ return(!buf_page_peek_if_young(bpage));
}
}
@@ -754,6 +770,35 @@ buf_block_get_lock_hash_val(
}
/********************************************************************//**
+Allocates a buf_page_t descriptor. This function must succeed. In case
+of failure we assert in this function.
+@return: the allocated descriptor. */
+UNIV_INLINE
+buf_page_t*
+buf_page_alloc_descriptor(void)
+/*===========================*/
+{
+ buf_page_t* bpage;
+
+ bpage = (buf_page_t*) ut_malloc(sizeof *bpage);
+ ut_d(memset(bpage, 0, sizeof *bpage));
+ UNIV_MEM_ALLOC(bpage, sizeof *bpage);
+
+ return(bpage);
+}
+
+/********************************************************************//**
+Free a buf_page_t descriptor. */
+UNIV_INLINE
+void
+buf_page_free_descriptor(
+/*=====================*/
+ buf_page_t* bpage) /*!< in: bpage descriptor to free. */
+{
+ ut_free(bpage);
+}
+
+/********************************************************************//**
Allocates a buffer block.
@return own: the allocated block, in state BUF_BLOCK_MEMORY */
UNIV_INLINE
@@ -910,19 +955,6 @@ buf_block_buf_fix_inc_func(
block->page.buf_fix_count++;
}
-#ifdef UNIV_SYNC_DEBUG
-/** Increments the bufferfix count.
-@param b in/out: block to bufferfix
-@param f in: file name where requested
-@param l in: line number where requested */
-# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b)
-#else /* UNIV_SYNC_DEBUG */
-/** Increments the bufferfix count.
-@param b in/out: block to bufferfix
-@param f in: file name where requested
-@param l in: line number where requested */
-# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b)
-#endif /* UNIV_SYNC_DEBUG */
/*******************************************************************//**
Decrements the bufferfix count. */
@@ -1119,7 +1151,7 @@ buf_block_dbg_add_level(
where we have acquired latch */
ulint level) /*!< in: latching order level */
{
- sync_thread_add_level(&block->lock, level);
+ sync_thread_add_level(&block->lock, level, FALSE);
}
#endif /* UNIV_SYNC_DEBUG */
#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/buf0lru.h b/storage/xtradb/include/buf0lru.h
index fe7c067dfb7..8abebfb675c 100644
--- a/storage/xtradb/include/buf0lru.h
+++ b/storage/xtradb/include/buf0lru.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -30,18 +30,6 @@ Created 11/5/1995 Heikki Tuuri
#include "ut0byte.h"
#include "buf0types.h"
-/** The return type of buf_LRU_free_block() */
-enum buf_lru_free_block_status {
- /** freed */
- BUF_LRU_FREED = 0,
- /** not freed because the caller asked to remove the
- uncompressed frame but the control block cannot be
- relocated */
- BUF_LRU_CANNOT_RELOCATE,
- /** not freed because of some other reason */
- BUF_LRU_NOT_FREED
-};
-
/******************************************************************//**
Tries to remove LRU flushed blocks from the end of the LRU list and put them
to the free list. This is beneficial for the efficiency of the insert buffer
@@ -91,6 +79,7 @@ void
buf_LRU_mark_space_was_deleted(
/*===========================*/
ulint id); /*!< in: space id */
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/********************************************************************//**
Insert a compressed block into buf_pool->zip_clean in the LRU order. */
UNIV_INTERN
@@ -98,22 +87,22 @@ void
buf_LRU_insert_zip_clean(
/*=====================*/
buf_page_t* bpage); /*!< in: pointer to the block in question */
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
/******************************************************************//**
Try to free a block. If bpage is a descriptor of a compressed-only
page, the descriptor object will be freed as well.
-NOTE: If this function returns BUF_LRU_FREED, it will temporarily
+NOTE: If this function returns TRUE, it will temporarily
release buf_pool_mutex. Furthermore, the page frame will no longer be
accessible via bpage.
The caller must hold buf_pool_mutex and buf_page_get_mutex(bpage) and
release these two mutexes after the call. No other
buf_page_get_mutex() may be held when calling this function.
-@return BUF_LRU_FREED if freed, BUF_LRU_CANNOT_RELOCATE or
-BUF_LRU_NOT_FREED otherwise. */
+@return TRUE if freed, FALSE otherwise. */
UNIV_INTERN
-enum buf_lru_free_block_status
+ibool
buf_LRU_free_block(
/*===============*/
buf_page_t* bpage, /*!< in: block to be freed */
diff --git a/storage/xtradb/include/buf0types.h b/storage/xtradb/include/buf0types.h
index ce3e5ecc9c5..9dd847bdaca 100644
--- a/storage/xtradb/include/buf0types.h
+++ b/storage/xtradb/include/buf0types.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -26,6 +26,8 @@ Created 11/17/1995 Heikki Tuuri
#ifndef buf0types_h
#define buf0types_h
+#include "page0types.h"
+
/** Buffer page (uncompressed or compressed) */
typedef struct buf_page_struct buf_page_t;
/** Buffer block for which an uncompressed page exists */
@@ -58,17 +60,10 @@ enum buf_io_fix {
/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */
/* @{ */
-#if UNIV_WORD_SIZE <= 4 /* 32-bit system */
-/** Base-2 logarithm of the smallest buddy block size */
-# define BUF_BUDDY_LOW_SHIFT 6
-#else /* 64-bit system */
-/** Base-2 logarithm of the smallest buddy block size */
-# define BUF_BUDDY_LOW_SHIFT 7
-#endif
+#define BUF_BUDDY_LOW_SHIFT PAGE_ZIP_MIN_SIZE_SHIFT
+
#define BUF_BUDDY_LOW (1 << BUF_BUDDY_LOW_SHIFT)
- /*!< minimum block size in the binary
- buddy system; must be at least
- sizeof(buf_page_t) */
+
#define BUF_BUDDY_SIZES (UNIV_PAGE_SIZE_SHIFT - BUF_BUDDY_LOW_SHIFT)
#define BUF_BUDDY_SIZES_MAX (UNIV_PAGE_SIZE_SHIFT_MAX - BUF_BUDDY_LOW_SHIFT)
/*!< number of buddy sizes */
diff --git a/storage/xtradb/include/dict0mem.h b/storage/xtradb/include/dict0mem.h
index f47293bedf6..3554274847c 100644
--- a/storage/xtradb/include/dict0mem.h
+++ b/storage/xtradb/include/dict0mem.h
@@ -340,6 +340,13 @@ struct dict_index_struct{
index, or 0 if the index existed
when InnoDB was started up */
#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_BLOB_DEBUG
+ mutex_t blobs_mutex;
+ /*!< mutex protecting blobs */
+ void* blobs; /*!< map of (page_no,heap_no,field_no)
+ to first_blob_page_no; protected by
+ blobs_mutex; @see btr_blob_dbg_t */
+#endif /* UNIV_BLOB_DEBUG */
#ifdef UNIV_DEBUG
ulint magic_n;/*!< magic number */
/** Value of dict_index_struct::magic_n */
diff --git a/storage/xtradb/include/fil0fil.h b/storage/xtradb/include/fil0fil.h
index fbf8ca20db3..11c4cb4ba03 100644
--- a/storage/xtradb/include/fil0fil.h
+++ b/storage/xtradb/include/fil0fil.h
@@ -670,8 +670,9 @@ UNIV_INTERN
void
fil_flush(
/*======*/
- ulint space_id); /*!< in: file space id (this can be a group of
+ ulint space_id, /*!< in: file space id (this can be a group of
log files or a tablespace of the database) */
+ ibool metadata);
/**********************************************************************//**
Flushes to disk writes in file spaces of the given type possibly cached by
the OS. */
diff --git a/storage/xtradb/include/hash0hash.h b/storage/xtradb/include/hash0hash.h
index 492c767acc4..b17c21a45ef 100644
--- a/storage/xtradb/include/hash0hash.h
+++ b/storage/xtradb/include/hash0hash.h
@@ -49,28 +49,6 @@ hash_table_t*
hash_create(
/*========*/
ulint n); /*!< in: number of array cells */
-
-/*************************************************************//**
-*/
-UNIV_INTERN
-ulint
-hash_create_needed(
-/*===============*/
- ulint n);
-
-UNIV_INTERN
-void
-hash_create_init(
-/*=============*/
- hash_table_t* table,
- ulint n);
-
-UNIV_INTERN
-void
-hash_create_reuse(
-/*==============*/
- hash_table_t* table);
-
#ifndef UNIV_HOTBACKUP
/*************************************************************//**
Creates a mutex array to protect a hash table. */
@@ -350,33 +328,6 @@ do {\
}\
} while (0)
-/********************************************************************//**
-Align nodes with moving location.*/
-#define HASH_OFFSET(TABLE, NODE_TYPE, PTR_NAME, FADDR, FOFFSET, BOFFSET) \
-do {\
- ulint i2222;\
- ulint cell_count2222;\
-\
- cell_count2222 = hash_get_n_cells(TABLE);\
-\
- for (i2222 = 0; i2222 < cell_count2222; i2222++) {\
- NODE_TYPE* node2222;\
-\
- if ((TABLE)->array[i2222].node) \
- (TABLE)->array[i2222].node = (void*)((byte*)(TABLE)->array[i2222].node \
- + (((TABLE)->array[i2222].node > (void*)FADDR)?FOFFSET:BOFFSET));\
- node2222 = HASH_GET_FIRST((TABLE), i2222);\
-\
- while (node2222) {\
- if (node2222->PTR_NAME) \
- node2222->PTR_NAME = (void*)((byte*)(node2222->PTR_NAME) \
- + ((((void*)node2222->PTR_NAME) > (void*)FADDR)?FOFFSET:BOFFSET));\
-\
- node2222 = node2222->PTR_NAME;\
- }\
- }\
-} while (0)
-
/************************************************************//**
Gets the mutex index for a fold value in a hash table.
@return mutex number */
diff --git a/storage/xtradb/include/mtr0mtr.h b/storage/xtradb/include/mtr0mtr.h
index bc3f1951be9..8a9ec8ea7f0 100644
--- a/storage/xtradb/include/mtr0mtr.h
+++ b/storage/xtradb/include/mtr0mtr.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -213,16 +213,6 @@ ulint
mtr_set_savepoint(
/*==============*/
mtr_t* mtr); /*!< in: mtr */
-/**********************************************************//**
-Releases the latches stored in an mtr memo down to a savepoint.
-NOTE! The mtr must not have made changes to buffer pages after the
-savepoint, as these can be handled only by mtr_commit. */
-UNIV_INTERN
-void
-mtr_rollback_to_savepoint(
-/*======================*/
- mtr_t* mtr, /*!< in: mtr */
- ulint savepoint); /*!< in: savepoint */
#ifndef UNIV_HOTBACKUP
/**********************************************************//**
Releases the (index tree) s-latch stored in an mtr memo after a
diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h
index 46bda4c6b45..5db7dc88b8f 100644
--- a/storage/xtradb/include/os0file.h
+++ b/storage/xtradb/include/os0file.h
@@ -473,7 +473,8 @@ UNIV_INTERN
ibool
os_file_flush(
/*==========*/
- os_file_t file); /*!< in, own: handle to a file */
+ os_file_t file, /*!< in, own: handle to a file */
+ ibool metadata);
/***********************************************************************//**
Retrieves the last error number if an error occurs in a file io function.
The number should be retrieved before any other OS calls (because they may
diff --git a/storage/xtradb/include/os0proc.h b/storage/xtradb/include/os0proc.h
index 582cef6f803..fd46bd7db87 100644
--- a/storage/xtradb/include/os0proc.h
+++ b/storage/xtradb/include/os0proc.h
@@ -32,11 +32,6 @@ Created 9/30/1995 Heikki Tuuri
#ifdef UNIV_LINUX
#include <sys/ipc.h>
#include <sys/shm.h>
-#else
-# if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H
-#include <sys/ipc.h>
-#include <sys/shm.h>
-# endif
#endif
typedef void* os_process_t;
@@ -75,29 +70,6 @@ os_mem_free_large(
ulint size); /*!< in: size returned by
os_mem_alloc_large() */
-
-/****************************************************************//**
-Allocates or attaches and reuses shared memory segment.
-The content is not cleared automatically.
-@return allocated memory */
-UNIV_INTERN
-void*
-os_shm_alloc(
-/*=========*/
- ulint* n, /*!< in/out: number of bytes */
- uint key,
- ibool* is_new);
-
-/****************************************************************//**
-Detach shared memory segment. */
-UNIV_INTERN
-void
-os_shm_free(
-/*========*/
- void *ptr, /*!< in: pointer returned by
- os_shm_alloc() */
- ulint size); /*!< in: size returned by
- os_shm_alloc() */
#ifndef UNIV_NONINL
#include "os0proc.ic"
#endif
diff --git a/storage/xtradb/include/page0cur.ic b/storage/xtradb/include/page0cur.ic
index 3520677dfb3..81474fa35f5 100644
--- a/storage/xtradb/include/page0cur.ic
+++ b/storage/xtradb/include/page0cur.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -27,6 +27,8 @@ Created 10/4/1994 Heikki Tuuri
#include "buf0types.h"
#ifdef UNIV_DEBUG
+# include "rem0cmp.h"
+
/*********************************************************//**
Gets pointer to the page frame where the cursor is positioned.
@return page */
@@ -268,6 +270,7 @@ page_cur_tuple_insert(
index, rec, offsets, mtr);
}
+ ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, offsets));
mem_heap_free(heap);
return(rec);
}
diff --git a/storage/xtradb/include/page0page.h b/storage/xtradb/include/page0page.h
index 5b2bcf7c054..aeaef030505 100644
--- a/storage/xtradb/include/page0page.h
+++ b/storage/xtradb/include/page0page.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -284,16 +284,42 @@ page_get_supremum_offset(
const page_t* page); /*!< in: page which must have record(s) */
#define page_get_infimum_rec(page) ((page) + page_get_infimum_offset(page))
#define page_get_supremum_rec(page) ((page) + page_get_supremum_offset(page))
+
/************************************************************//**
-Returns the middle record of record list. If there are an even number
-of records in the list, returns the first record of upper half-list.
-@return middle record */
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record */
UNIV_INTERN
+const rec_t*
+page_rec_get_nth_const(
+/*===================*/
+ const page_t* page, /*!< in: page */
+ ulint nth) /*!< in: nth record */
+ __attribute__((nonnull, warn_unused_result));
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record */
+UNIV_INLINE
+rec_t*
+page_rec_get_nth(
+/*=============*/
+ page_t* page, /*< in: page */
+ ulint nth) /*!< in: nth record */
+ __attribute__((nonnull, warn_unused_result));
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Returns the middle record of the records on the page. If there is an
+even number of records in the list, returns the first record of the
+upper half-list.
+@return middle record */
+UNIV_INLINE
rec_t*
page_get_middle_rec(
/*================*/
- page_t* page); /*!< in: page */
-#ifndef UNIV_HOTBACKUP
+ page_t* page) /*!< in: page */
+ __attribute__((nonnull, warn_unused_result));
/*************************************************************//**
Compares a data tuple to a physical record. Differs from the function
cmp_dtuple_rec_with_match in the way that the record must reside on an
@@ -348,6 +374,7 @@ page_get_n_recs(
/***************************************************************//**
Returns the number of records before the given record in chain.
The number includes infimum and supremum records.
+This is the inverse function of page_rec_get_nth().
@return number of records */
UNIV_INTERN
ulint
diff --git a/storage/xtradb/include/page0page.ic b/storage/xtradb/include/page0page.ic
index dab9dc742e4..b34408aed17 100644
--- a/storage/xtradb/include/page0page.ic
+++ b/storage/xtradb/include/page0page.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -423,7 +423,37 @@ page_rec_is_infimum(
return(page_rec_is_infimum_low(page_offset(rec)));
}
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record */
+UNIV_INLINE
+rec_t*
+page_rec_get_nth(
+/*=============*/
+ page_t* page, /*!< in: page */
+ ulint nth) /*!< in: nth record */
+{
+ return((rec_t*) page_rec_get_nth_const(page, nth));
+}
+
#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Returns the middle record of the records on the page. If there is an
+even number of records in the list, returns the first record of the
+upper half-list.
+@return middle record */
+UNIV_INLINE
+rec_t*
+page_get_middle_rec(
+/*================*/
+ page_t* page) /*!< in: page */
+{
+ ulint middle = (page_get_n_recs(page) + PAGE_HEAP_NO_USER_LOW) / 2;
+
+ return(page_rec_get_nth(page, middle));
+}
+
/*************************************************************//**
Compares a data tuple to a physical record. Differs from the function
cmp_dtuple_rec_with_match in the way that the record must reside on an
diff --git a/storage/xtradb/include/page0zip.h b/storage/xtradb/include/page0zip.h
index 4d37302ed20..fe3d2e52e0b 100644
--- a/storage/xtradb/include/page0zip.h
+++ b/storage/xtradb/include/page0zip.h
@@ -420,7 +420,7 @@ page_zip_copy_recs(
const page_t* src, /*!< in: page */
dict_index_t* index, /*!< in: index of the B-tree */
mtr_t* mtr) /*!< in: mini-transaction */
- __attribute__((nonnull(1,2,3,4)));
+ __attribute__((nonnull));
#endif /* !UNIV_HOTBACKUP */
/**********************************************************************//**
diff --git a/storage/xtradb/include/rem0rec.h b/storage/xtradb/include/rem0rec.h
index 17d08afabb9..06de23be757 100644
--- a/storage/xtradb/include/rem0rec.h
+++ b/storage/xtradb/include/rem0rec.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -480,6 +480,18 @@ ulint
rec_offs_any_extern(
/*================*/
const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+/******************************************************//**
+Determine if the offsets are for a record containing null BLOB pointers.
+@return first field containing a null BLOB pointer, or NULL if none found */
+UNIV_INLINE
+const byte*
+rec_offs_any_null_extern(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ const ulint* offsets) /*!< in: rec_get_offsets(rec) */
+ __attribute__((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
/******************************************************//**
Returns nonzero if the extern bit is set in nth field of rec.
@return nonzero if externally stored */
diff --git a/storage/xtradb/include/rem0rec.ic b/storage/xtradb/include/rem0rec.ic
index fa96c97f95e..3875a4cf814 100644
--- a/storage/xtradb/include/rem0rec.ic
+++ b/storage/xtradb/include/rem0rec.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -26,6 +26,7 @@ Created 5/30/1994 Heikki Tuuri
#include "mach0data.h"
#include "ut0byte.h"
#include "dict0dict.h"
+#include "btr0types.h"
/* Compact flag ORed to the extra size returned by rec_get_offsets() */
#define REC_OFFS_COMPACT ((ulint) 1 << 31)
@@ -1087,6 +1088,44 @@ rec_offs_any_extern(
return(UNIV_UNLIKELY(*rec_offs_base(offsets) & REC_OFFS_EXTERNAL));
}
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+/******************************************************//**
+Determine if the offsets are for a record containing null BLOB pointers.
+@return first field containing a null BLOB pointer, or NULL if none found */
+UNIV_INLINE
+const byte*
+rec_offs_any_null_extern(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ const ulint* offsets) /*!< in: rec_get_offsets(rec) */
+{
+ ulint i;
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (!rec_offs_any_extern(offsets)) {
+ return(NULL);
+ }
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ const byte* field
+ = rec_get_nth_field(rec, offsets, i, &len);
+
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ if (!memcmp(field + len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE)) {
+ return(field);
+ }
+ }
+ }
+
+ return(NULL);
+}
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
/******************************************************//**
Returns nonzero if the extern bit is set in nth field of rec.
@return nonzero if externally stored */
diff --git a/storage/xtradb/include/row0row.h b/storage/xtradb/include/row0row.h
index 723b7b53395..36fb26482ce 100644
--- a/storage/xtradb/include/row0row.h
+++ b/storage/xtradb/include/row0row.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -38,16 +38,16 @@ Created 4/20/1996 Heikki Tuuri
#include "btr0types.h"
/*********************************************************************//**
-Gets the offset of the trx id field, in bytes relative to the origin of
+Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of
a clustered index record.
@return offset of DATA_TRX_ID */
-UNIV_INTERN
+UNIV_INLINE
ulint
row_get_trx_id_offset(
/*==================*/
- const rec_t* rec, /*!< in: record */
- dict_index_t* index, /*!< in: clustered index */
- const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */
+ const dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: record offsets */
+ __attribute__((nonnull, warn_unused_result));
/*********************************************************************//**
Reads the trx id field from a clustered index record.
@return value of the field */
@@ -55,9 +55,10 @@ UNIV_INLINE
trx_id_t
row_get_rec_trx_id(
/*===============*/
- const rec_t* rec, /*!< in: record */
- dict_index_t* index, /*!< in: clustered index */
- const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+ __attribute__((nonnull, warn_unused_result));
/*********************************************************************//**
Reads the roll pointer field from a clustered index record.
@return value of the field */
@@ -65,9 +66,10 @@ UNIV_INLINE
roll_ptr_t
row_get_rec_roll_ptr(
/*=================*/
- const rec_t* rec, /*!< in: record */
- dict_index_t* index, /*!< in: clustered index */
- const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+ __attribute__((nonnull, warn_unused_result));
/*****************************************************************//**
When an insert or purge to a table is performed, this function builds
the entry to be inserted into or purged from an index on the table.
diff --git a/storage/xtradb/include/row0row.ic b/storage/xtradb/include/row0row.ic
index 05c007641af..0b9ca982af8 100644
--- a/storage/xtradb/include/row0row.ic
+++ b/storage/xtradb/include/row0row.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -28,15 +28,42 @@ Created 4/20/1996 Heikki Tuuri
#include "trx0undo.h"
/*********************************************************************//**
+Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of
+a clustered index record.
+@return offset of DATA_TRX_ID */
+UNIV_INLINE
+ulint
+row_get_trx_id_offset(
+/*==================*/
+ const dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: record offsets */
+{
+ ulint pos;
+ ulint offset;
+ ulint len;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(NULL, index, offsets));
+
+ pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+
+ offset = rec_get_nth_field_offs(offsets, pos, &len);
+
+ ut_ad(len == DATA_TRX_ID_LEN);
+
+ return(offset);
+}
+
+/*********************************************************************//**
Reads the trx id field from a clustered index record.
@return value of the field */
UNIV_INLINE
trx_id_t
row_get_rec_trx_id(
/*===============*/
- const rec_t* rec, /*!< in: record */
- dict_index_t* index, /*!< in: clustered index */
- const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
{
ulint offset;
@@ -46,7 +73,7 @@ row_get_rec_trx_id(
offset = index->trx_id_offset;
if (!offset) {
- offset = row_get_trx_id_offset(rec, index, offsets);
+ offset = row_get_trx_id_offset(index, offsets);
}
return(trx_read_trx_id(rec + offset));
@@ -59,9 +86,9 @@ UNIV_INLINE
roll_ptr_t
row_get_rec_roll_ptr(
/*=================*/
- const rec_t* rec, /*!< in: record */
- dict_index_t* index, /*!< in: clustered index */
- const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
{
ulint offset;
@@ -71,7 +98,7 @@ row_get_rec_roll_ptr(
offset = index->trx_id_offset;
if (!offset) {
- offset = row_get_trx_id_offset(rec, index, offsets);
+ offset = row_get_trx_id_offset(index, offsets);
}
return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN));
diff --git a/storage/xtradb/include/row0upd.ic b/storage/xtradb/include/row0upd.ic
index 18e22f1eca9..0894ed373b0 100644
--- a/storage/xtradb/include/row0upd.ic
+++ b/storage/xtradb/include/row0upd.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -171,7 +171,7 @@ row_upd_rec_sys_fields(
ulint offset = index->trx_id_offset;
if (!offset) {
- offset = row_get_trx_id_offset(rec, index, offsets);
+ offset = row_get_trx_id_offset(index, offsets);
}
#if DATA_TRX_ID + 1 != DATA_ROLL_PTR
diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h
index 29d88331532..49be9a2ec3a 100644
--- a/storage/xtradb/include/srv0srv.h
+++ b/storage/xtradb/include/srv0srv.h
@@ -161,13 +161,10 @@ extern ulint srv_buf_pool_curr_size; /*!< current size in bytes */
extern ulint srv_mem_pool_size;
extern ulint srv_lock_table_size;
-extern uint srv_buffer_pool_shm_key;
-extern ibool srv_buffer_pool_shm_is_reused;
-extern ibool srv_buffer_pool_shm_checksum;
-
extern ibool srv_thread_concurrency_timer_based;
extern ulint srv_n_file_io_threads;
+extern my_bool srv_random_read_ahead;
extern ulong srv_read_ahead_threshold;
extern ulint srv_n_read_io_threads;
extern ulint srv_n_write_io_threads;
@@ -290,6 +287,7 @@ extern ibool srv_print_latch_waits;
extern ulint srv_activity_count;
extern ulint srv_fatal_semaphore_wait_threshold;
extern ulint srv_dml_needed_delay;
+extern long long srv_kill_idle_transaction;
extern mutex_t* kernel_mutex_temp;/* mutex protecting the server, trx structs,
query threads, and lock table: we allocate
@@ -353,6 +351,9 @@ extern ulint srv_buf_pool_reads;
/** Time in seconds between automatic buffer pool dumps */
extern uint srv_auto_lru_dump;
+/** Whether startup should be blocked until buffer pool is fully restored */
+extern ibool srv_blocking_lru_restore;
+
/** Status variables to be passed to MySQL */
typedef struct export_var_struct export_struc;
@@ -696,6 +697,7 @@ struct export_var_struct{
ulint innodb_buffer_pool_wait_free; /*!< srv_buf_pool_wait_free */
ulint innodb_buffer_pool_pages_flushed; /*!< srv_buf_pool_flushed */
ulint innodb_buffer_pool_write_requests;/*!< srv_buf_pool_write_requests */
+ ulint innodb_buffer_pool_read_ahead_rnd;/*!< srv_read_ahead_rnd */
ulint innodb_buffer_pool_read_ahead; /*!< srv_read_ahead */
ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/
ulint innodb_deadlocks; /* ??? */
diff --git a/storage/xtradb/include/sync0arr.h b/storage/xtradb/include/sync0arr.h
index 5f1280f5e28..6e931346238 100644
--- a/storage/xtradb/include/sync0arr.h
+++ b/storage/xtradb/include/sync0arr.h
@@ -115,8 +115,11 @@ Prints warnings of long semaphore waits to stderr.
@return TRUE if fatal semaphore wait threshold was exceeded */
UNIV_INTERN
ibool
-sync_array_print_long_waits(void);
-/*=============================*/
+sync_array_print_long_waits(
+/*========================*/
+ os_thread_id_t* waiter, /*!< out: longest waiting thread */
+ const void** sema) /*!< out: longest-waited-for semaphore */
+ __attribute__((nonnull));
/********************************************************************//**
Validates the integrity of the wait array. Checks
that the number of reserved cells equals the count variable. */
diff --git a/storage/xtradb/include/sync0rw.ic b/storage/xtradb/include/sync0rw.ic
index 7116f1b7c9b..485a63a1b18 100644
--- a/storage/xtradb/include/sync0rw.ic
+++ b/storage/xtradb/include/sync0rw.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -603,16 +603,16 @@ rw_lock_x_unlock_direct(
ut_ad((lock->lock_word % X_LOCK_DECR) == 0);
-#ifdef UNIV_SYNC_DEBUG
- rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX);
-#endif
-
if (lock->lock_word == 0) {
lock->recursive = FALSE;
UNIV_MEM_INVALID(&lock->writer_thread,
sizeof lock->writer_thread);
}
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX);
+#endif
+
lock->lock_word += X_LOCK_DECR;
ut_ad(!lock->waiters);
diff --git a/storage/xtradb/include/sync0sync.h b/storage/xtradb/include/sync0sync.h
index 6aaab1cc7d7..90def8efa38 100644
--- a/storage/xtradb/include/sync0sync.h
+++ b/storage/xtradb/include/sync0sync.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -219,8 +219,10 @@ void
sync_thread_add_level(
/*==================*/
void* latch, /*!< in: pointer to a mutex or an rw-lock */
- ulint level); /*!< in: level in the latching order; if
+ ulint level, /*!< in: level in the latching order; if
SYNC_LEVEL_VARYING, nothing is done */
+ ibool relock) /*!< in: TRUE if re-entering an x-lock */
+ __attribute__((nonnull));
/******************************************************************//**
Removes a latch from the thread level array if it is found there.
@return TRUE if found in the array; it is no error if the latch is
diff --git a/storage/xtradb/include/trx0sys.h b/storage/xtradb/include/trx0sys.h
index 2637189f37e..eafa1ab6409 100644
--- a/storage/xtradb/include/trx0sys.h
+++ b/storage/xtradb/include/trx0sys.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -317,6 +317,17 @@ ibool
trx_in_trx_list(
/*============*/
trx_t* in_trx);/*!< in: trx */
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+/***********************************************************//**
+Assert that a transaction has been recovered.
+@return TRUE */
+UNIV_INLINE
+ibool
+trx_assert_recovered(
+/*=================*/
+ trx_id_t trx_id) /*!< in: transaction identifier */
+ __attribute__((warn_unused_result));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
/*****************************************************************//**
Updates the offset information about the end of the MySQL binlog entry
which corresponds to the transaction just being committed. In a MySQL
diff --git a/storage/xtradb/include/trx0sys.ic b/storage/xtradb/include/trx0sys.ic
index 5e0f07c8b9d..234fc0b92e9 100644
--- a/storage/xtradb/include/trx0sys.ic
+++ b/storage/xtradb/include/trx0sys.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -311,6 +311,28 @@ trx_get_on_id(
return(NULL);
}
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+/***********************************************************//**
+Assert that a transaction has been recovered.
+@return TRUE */
+UNIV_INLINE
+ibool
+trx_assert_recovered(
+/*=================*/
+ trx_id_t trx_id) /*!< in: transaction identifier */
+{
+ trx_t* trx;
+
+ mutex_enter(&kernel_mutex);
+ trx = trx_get_on_id(trx_id);
+ ut_a(trx);
+ ut_a(trx->is_recovered);
+ mutex_exit(&kernel_mutex);
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
/****************************************************************//**
Returns the minumum trx id in trx list. This is the smallest id for which
the trx can possibly be active. (But, you must look at the trx->conc_state to
diff --git a/storage/xtradb/include/trx0trx.h b/storage/xtradb/include/trx0trx.h
index 8858fe2fafa..6a9fc324ff9 100644
--- a/storage/xtradb/include/trx0trx.h
+++ b/storage/xtradb/include/trx0trx.h
@@ -44,6 +44,9 @@ extern sess_t* trx_dummy_sess;
/** Number of transactions currently allocated for MySQL: protected by
the kernel mutex */
extern ulint trx_n_mysql_transactions;
+/** Number of transactions currently in the XA PREPARED state: protected by
+the kernel mutex */
+extern ulint trx_n_prepared;
/********************************************************************//**
Releases the search latch if trx has reserved it. */
@@ -108,6 +111,14 @@ trx_free(
/*=====*/
trx_t* trx); /*!< in, own: trx object */
/********************************************************************//**
+At shutdown, frees a transaction object that is in the PREPARED state. */
+UNIV_INTERN
+void
+trx_free_prepared(
+/*==============*/
+ trx_t* trx) /*!< in, own: trx object */
+ __attribute__((nonnull));
+/********************************************************************//**
Frees a transaction object for MySQL. */
UNIV_INTERN
void
@@ -498,6 +509,7 @@ struct trx_struct{
150 bytes in the undo log size as then
we skip XA steps */
ulint flush_log_at_trx_commit_session;
+ ulint fake_changes;
ulint flush_log_later;/* In 2PC, we hold the
prepare_commit mutex across
both phases. In that case, we
@@ -590,6 +602,8 @@ struct trx_struct{
ulint mysql_process_no;/* since in Linux, 'top' reports
process id's and not thread id's, we
store the process number too */
+ time_t idle_start;
+ ib_int64_t last_stmt_start;
/*------------------------------*/
ulint n_mysql_tables_in_use; /* number of Innobase tables
used in the processing of the current
diff --git a/storage/xtradb/include/trx0undo.h b/storage/xtradb/include/trx0undo.h
index a084f2394b5..4f15cd85833 100644
--- a/storage/xtradb/include/trx0undo.h
+++ b/storage/xtradb/include/trx0undo.h
@@ -298,6 +298,15 @@ void
trx_undo_insert_cleanup(
/*====================*/
trx_t* trx); /*!< in: transaction handle */
+
+/********************************************************************//**
+At shutdown, frees the undo logs of a PREPARED transaction. */
+UNIV_INTERN
+void
+trx_undo_free_prepared(
+/*===================*/
+ trx_t* trx) /*!< in/out: PREPARED transaction */
+ __attribute__((nonnull));
#endif /* !UNIV_HOTBACKUP */
/***********************************************************//**
Parses the redo log entry of an undo log page initialization.
diff --git a/storage/xtradb/include/univ.i b/storage/xtradb/include/univ.i
index 7b11a16dae9..902d0c94ddd 100644
--- a/storage/xtradb/include/univ.i
+++ b/storage/xtradb/include/univ.i
@@ -1,8 +1,7 @@
/*****************************************************************************
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
-Copyright (c) 2009, Sun Microsystems, Inc.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -46,8 +45,8 @@ Created 1/20/1994 Heikki Tuuri
#define INNODB_VERSION_MAJOR 1
#define INNODB_VERSION_MINOR 0
-#define INNODB_VERSION_BUGFIX 15
-#define PERCONA_INNODB_VERSION 12.7
+#define INNODB_VERSION_BUGFIX 17
+#define PERCONA_INNODB_VERSION 13.0
/* The following is the InnoDB version as shown in
SELECT plugin_version FROM information_schema.plugins;
@@ -197,6 +196,8 @@ this will break redo log file compatibility, but it may be useful when
debugging redo log application problems. */
#define UNIV_MEM_DEBUG /* detect memory leaks etc */
#define UNIV_IBUF_DEBUG /* debug the insert buffer */
+#define UNIV_BLOB_DEBUG /* track BLOB ownership;
+assumes that no BLOBs survive server restart */
#define UNIV_IBUF_COUNT_DEBUG /* debug the insert buffer;
this limits the database to IBUF_COUNT_N_SPACES and IBUF_COUNT_N_PAGES,
and the insert buffer must be empty when the database is started */
diff --git a/storage/xtradb/include/ut0lst.h b/storage/xtradb/include/ut0lst.h
index 245dfc226c3..7b15c052978 100644
--- a/storage/xtradb/include/ut0lst.h
+++ b/storage/xtradb/include/ut0lst.h
@@ -257,48 +257,5 @@ do { \
ut_a(ut_list_node_313 == NULL); \
} while (0)
-/********************************************************************//**
-Align nodes with moving location.
-@param NAME the name of the list
-@param TYPE node type
-@param BASE base node (not a pointer to it)
-@param OFFSET offset moved */
-#define UT_LIST_OFFSET(NAME, TYPE, BASE, FADDR, FOFFSET, BOFFSET) \
-do { \
- ulint ut_list_i_313; \
- TYPE* ut_list_node_313; \
- \
- if ((BASE).start) \
- (BASE).start = (void*)((byte*)((BASE).start) \
- + (((void*)((BASE).start) > (void*)FADDR)?FOFFSET:BOFFSET));\
- if ((BASE).end) \
- (BASE).end = (void*)((byte*)((BASE).end) \
- + (((void*)((BASE).end) > (void*)FADDR)?FOFFSET:BOFFSET));\
- \
- ut_list_node_313 = (BASE).start; \
- \
- for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) { \
- ut_a(ut_list_node_313); \
- if ((ut_list_node_313->NAME).prev) \
- (ut_list_node_313->NAME).prev = (void*)((byte*)((ut_list_node_313->NAME).prev)\
- + (((void*)((ut_list_node_313->NAME).prev) > (void*)FADDR)?FOFFSET:BOFFSET));\
- if ((ut_list_node_313->NAME).next) \
- (ut_list_node_313->NAME).next = (void*)((byte*)((ut_list_node_313->NAME).next)\
- + (((void*)((ut_list_node_313->NAME).next)> (void*)FADDR)?FOFFSET:BOFFSET));\
- ut_list_node_313 = (ut_list_node_313->NAME).next; \
- } \
- \
- ut_a(ut_list_node_313 == NULL); \
- \
- ut_list_node_313 = (BASE).end; \
- \
- for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) { \
- ut_a(ut_list_node_313); \
- ut_list_node_313 = (ut_list_node_313->NAME).prev; \
- } \
- \
- ut_a(ut_list_node_313 == NULL); \
-} while (0)
-
#endif
diff --git a/storage/xtradb/include/ut0mem.h b/storage/xtradb/include/ut0mem.h
index f14606be966..9c6ee9049ec 100644
--- a/storage/xtradb/include/ut0mem.h
+++ b/storage/xtradb/include/ut0mem.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -210,43 +210,6 @@ ut_strlcpy_rev(
ulint size); /*!< in: size of destination buffer */
/**********************************************************************//**
-Compute strlen(ut_strcpyq(str, q)).
-@return length of the string when quoted */
-UNIV_INLINE
-ulint
-ut_strlenq(
-/*=======*/
- const char* str, /*!< in: null-terminated string */
- char q); /*!< in: the quote character */
-
-/**********************************************************************//**
-Make a quoted copy of a NUL-terminated string. Leading and trailing
-quotes will not be included; only embedded quotes will be escaped.
-See also ut_strlenq() and ut_memcpyq().
-@return pointer to end of dest */
-UNIV_INTERN
-char*
-ut_strcpyq(
-/*=======*/
- char* dest, /*!< in: output buffer */
- char q, /*!< in: the quote character */
- const char* src); /*!< in: null-terminated string */
-
-/**********************************************************************//**
-Make a quoted copy of a fixed-length string. Leading and trailing
-quotes will not be included; only embedded quotes will be escaped.
-See also ut_strlenq() and ut_strcpyq().
-@return pointer to end of dest */
-UNIV_INTERN
-char*
-ut_memcpyq(
-/*=======*/
- char* dest, /*!< in: output buffer */
- char q, /*!< in: the quote character */
- const char* src, /*!< in: string to be quoted */
- ulint len); /*!< in: length of src */
-
-/**********************************************************************//**
Return the number of times s2 occurs in s1. Overlapping instances of s2
are only counted once.
@return the number of times s2 occurs in s1 */
diff --git a/storage/xtradb/include/ut0mem.ic b/storage/xtradb/include/ut0mem.ic
index f36c28f1989..c06e2b3ae81 100644
--- a/storage/xtradb/include/ut0mem.ic
+++ b/storage/xtradb/include/ut0mem.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -99,27 +99,6 @@ ut_strcmp(const char* str1, const char* str2)
}
/**********************************************************************//**
-Compute strlen(ut_strcpyq(str, q)).
-@return length of the string when quoted */
-UNIV_INLINE
-ulint
-ut_strlenq(
-/*=======*/
- const char* str, /*!< in: null-terminated string */
- char q) /*!< in: the quote character */
-{
- ulint len;
-
- for (len = 0; *str; len++, str++) {
- if (*str == q) {
- len++;
- }
- }
-
- return(len);
-}
-
-/**********************************************************************//**
Converts a raw binary data to a NUL-terminated hex string. The output is
truncated if there is not enough space in "hex", make sure "hex_size" is at
least (2 * raw_size + 1) if you do not want this to happen. Returns the
diff --git a/storage/xtradb/lock/lock0lock.c b/storage/xtradb/lock/lock0lock.c
index 4fcb5b2c522..e5da4f46ec9 100644
--- a/storage/xtradb/lock/lock0lock.c
+++ b/storage/xtradb/lock/lock0lock.c
@@ -3908,6 +3908,10 @@ lock_table(
trx = thr_get_trx(thr);
+ if (trx->fake_changes && mode == LOCK_IX) {
+ mode = LOCK_IS;
+ }
+
lock_mutex_enter_kernel();
/* Look for stronger locks the same trx already has on the table */
@@ -5109,6 +5113,11 @@ lock_rec_insert_check_and_lock(
}
trx = thr_get_trx(thr);
+
+ if (trx->fake_changes) {
+ return(DB_SUCCESS);
+ }
+
next_rec = page_rec_get_next_const(rec);
next_rec_heap_no = page_rec_get_heap_no(next_rec);
@@ -5277,6 +5286,10 @@ lock_clust_rec_modify_check_and_lock(
return(DB_SUCCESS);
}
+ if (thr && thr_get_trx(thr)->fake_changes) {
+ return(DB_SUCCESS);
+ }
+
heap_no = rec_offs_comp(offsets)
? rec_get_heap_no_new(rec)
: rec_get_heap_no_old(rec);
@@ -5335,6 +5348,10 @@ lock_sec_rec_modify_check_and_lock(
return(DB_SUCCESS);
}
+ if (thr && thr_get_trx(thr)->fake_changes) {
+ return(DB_SUCCESS);
+ }
+
heap_no = page_rec_get_heap_no(rec);
/* Another transaction cannot have an implicit lock on the record,
@@ -5422,6 +5439,10 @@ lock_sec_rec_read_check_and_lock(
return(DB_SUCCESS);
}
+ if (thr && thr_get_trx(thr)->fake_changes && mode == LOCK_X) {
+ mode = LOCK_S;
+ }
+
heap_no = page_rec_get_heap_no(rec);
lock_mutex_enter_kernel();
@@ -5499,6 +5520,10 @@ lock_clust_rec_read_check_and_lock(
return(DB_SUCCESS);
}
+ if (thr && thr_get_trx(thr)->fake_changes && mode == LOCK_X) {
+ mode = LOCK_S;
+ }
+
heap_no = page_rec_get_heap_no(rec);
lock_mutex_enter_kernel();
diff --git a/storage/xtradb/log/log0log.c b/storage/xtradb/log/log0log.c
index 82c6c3da23e..7d394504f46 100644
--- a/storage/xtradb/log/log0log.c
+++ b/storage/xtradb/log/log0log.c
@@ -1116,7 +1116,7 @@ log_io_complete(
&& srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
- fil_flush(group->space_id);
+ fil_flush(group->space_id, FALSE);
}
#ifdef UNIV_DEBUG
@@ -1139,7 +1139,7 @@ log_io_complete(
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC
&& srv_flush_log_at_trx_commit != 2) {
- fil_flush(group->space_id);
+ fil_flush(group->space_id, FALSE);
}
mutex_enter(&(log_sys->mutex));
@@ -1530,7 +1530,7 @@ loop:
group = UT_LIST_GET_FIRST(log_sys->log_groups);
- fil_flush(group->space_id);
+ fil_flush(group->space_id, FALSE);
log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
}
@@ -2706,7 +2706,7 @@ log_io_complete_archive(void)
mutex_exit(&(log_sys->mutex));
- fil_flush(group->archive_space_id);
+ fil_flush(group->archive_space_id, TRUE);
mutex_enter(&(log_sys->mutex));
@@ -3209,12 +3209,13 @@ loop:
goto loop;
}
- /* Check that there are no longer transactions. We need this wait even
- for the 'very fast' shutdown, because the InnoDB layer may have
- committed or prepared transactions and we don't want to lose them. */
+ /* Check that there are no longer transactions, except for
+ PREPARED ones. We need this wait even for the 'very fast'
+ shutdown, because the InnoDB layer may have committed or
+ prepared transactions and we don't want to lose them. */
if (trx_n_mysql_transactions > 0
- || UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
+ || UT_LIST_GET_LEN(trx_sys->trx_list) > trx_n_prepared) {
mutex_exit(&kernel_mutex);
diff --git a/storage/xtradb/log/log0recv.c b/storage/xtradb/log/log0recv.c
index 895067c700a..fae7fbd0da0 100644
--- a/storage/xtradb/log/log0recv.c
+++ b/storage/xtradb/log/log0recv.c
@@ -2899,7 +2899,6 @@ recv_init_crash_recovery(void)
/*==========================*/
{
ut_a(!recv_needed_recovery);
- ut_a(!srv_buffer_pool_shm_is_reused);
recv_needed_recovery = TRUE;
@@ -3622,7 +3621,7 @@ recv_reset_log_files_for_backup(
exit(1);
}
- os_file_flush(log_file);
+ os_file_flush(log_file, TRUE);
os_file_close(log_file);
}
@@ -3645,7 +3644,7 @@ recv_reset_log_files_for_backup(
os_file_write(name, log_file, buf, 0, 0,
LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
- os_file_flush(log_file);
+ os_file_flush(log_file, TRUE);
os_file_close(log_file);
ut_free(buf);
diff --git a/storage/xtradb/mtr/mtr0mtr.c b/storage/xtradb/mtr/mtr0mtr.c
index 34e6d3ffc92..a9f1c35f84c 100644
--- a/storage/xtradb/mtr/mtr0mtr.c
+++ b/storage/xtradb/mtr/mtr0mtr.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -248,40 +248,6 @@ mtr_commit(
}
#ifndef UNIV_HOTBACKUP
-/**********************************************************//**
-Releases the latches stored in an mtr memo down to a savepoint.
-NOTE! The mtr must not have made changes to buffer pages after the
-savepoint, as these can be handled only by mtr_commit. */
-UNIV_INTERN
-void
-mtr_rollback_to_savepoint(
-/*======================*/
- mtr_t* mtr, /*!< in: mtr */
- ulint savepoint) /*!< in: savepoint */
-{
- mtr_memo_slot_t* slot;
- dyn_array_t* memo;
- ulint offset;
-
- ut_ad(mtr);
- ut_ad(mtr->magic_n == MTR_MAGIC_N);
- ut_ad(mtr->state == MTR_ACTIVE);
-
- memo = &(mtr->memo);
-
- offset = dyn_array_get_data_size(memo);
- ut_ad(offset >= savepoint);
-
- while (offset > savepoint) {
- offset -= sizeof(mtr_memo_slot_t);
-
- slot = dyn_array_get_element(memo, offset);
-
- ut_ad(slot->type != MTR_MEMO_MODIFY);
- mtr_memo_slot_release(mtr, slot);
- }
-}
-
/***************************************************//**
Releases an object in the memo stack. */
UNIV_INTERN
diff --git a/storage/xtradb/os/os0file.c b/storage/xtradb/os/os0file.c
index ee164bd6317..ef20869c4c5 100644
--- a/storage/xtradb/os/os0file.c
+++ b/storage/xtradb/os/os0file.c
@@ -2017,7 +2017,7 @@ os_file_set_size(
ut_free(buf2);
- ret = os_file_flush(file);
+ ret = os_file_flush(file, TRUE);
if (ret) {
return(TRUE);
@@ -2055,7 +2055,8 @@ static
int
os_file_fsync(
/*==========*/
- os_file_t file) /*!< in: handle to a file */
+ os_file_t file, /*!< in: handle to a file */
+ ibool metadata)
{
int ret;
int failures;
@@ -2064,7 +2065,16 @@ os_file_fsync(
failures = 0;
do {
+#if defined(HAVE_FDATASYNC) && HAVE_DECL_FDATASYNC
+ if (metadata) {
+ ret = fsync(file);
+ } else {
+ ret = fdatasync(file);
+ }
+#else
+ (void) metadata;
ret = fsync(file);
+#endif
os_n_fsyncs++;
@@ -2083,6 +2093,9 @@ os_file_fsync(
failures++;
retry = TRUE;
+ } else if (ret == -1 && errno == EINTR) {
+ /* Handle signal interruptions correctly */
+ retry = TRUE;
} else {
retry = FALSE;
@@ -2100,7 +2113,8 @@ UNIV_INTERN
ibool
os_file_flush(
/*==========*/
- os_file_t file) /*!< in, own: handle to a file */
+ os_file_t file, /*!< in, own: handle to a file */
+ ibool metadata)
{
#ifdef __WIN__
BOOL ret;
@@ -2150,18 +2164,18 @@ os_file_flush(
/* If we are not on an operating system that supports this,
then fall back to a plain fsync. */
- ret = os_file_fsync(file);
+ ret = os_file_fsync(file, metadata);
} else {
ret = fcntl(file, F_FULLFSYNC, NULL);
if (ret) {
/* If we are not on a file system that supports this,
then fall back to a plain fsync. */
- ret = os_file_fsync(file);
+ ret = os_file_fsync(file, metadata);
}
}
#else
- ret = os_file_fsync(file);
+ ret = os_file_fsync(file, metadata);
#endif
if (ret == 0) {
@@ -2214,6 +2228,7 @@ _os_file_pread(
off_t offs;
#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
ssize_t n_bytes;
+ ssize_t n_read;
#endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */
ulint sec;
ulint ms;
@@ -2254,7 +2269,18 @@ _os_file_pread(
os_n_pending_reads++;
os_mutex_exit(os_file_count_mutex);
- n_bytes = pread(file, buf, (ssize_t)n, offs);
+ /* Handle signal interruptions correctly */
+ for (n_bytes = 0; n_bytes < (ssize_t) n; ) {
+ n_read = pread(file, buf, (ssize_t)n, offs);
+ if (n_read > 0) {
+ n_bytes += n_read;
+ offs += n_read;
+ } else if (n_read == -1 && errno == EINTR) {
+ continue;
+ } else {
+ break;
+ }
+ }
os_mutex_enter(os_file_count_mutex);
os_file_n_pending_preads--;
@@ -2273,6 +2299,7 @@ _os_file_pread(
{
off_t ret_offset;
ssize_t ret;
+ ssize_t n_read;
#ifndef UNIV_HOTBACKUP
ulint i;
#endif /* !UNIV_HOTBACKUP */
@@ -2293,7 +2320,17 @@ _os_file_pread(
if (ret_offset < 0) {
ret = -1;
} else {
- ret = read(file, buf, (ssize_t)n);
+ /* Handle signal interruptions correctly */
+ for (ret = 0; ret < (ssize_t) n; ) {
+ n_read = read(file, buf, (ssize_t)n);
+ if (n_read > 0) {
+ ret += n_read;
+ } else if (n_read == -1 && errno == EINTR) {
+ continue;
+ } else {
+ break;
+ }
+ }
}
#ifndef UNIV_HOTBACKUP
@@ -2332,6 +2369,7 @@ os_file_pwrite(
offset */
{
ssize_t ret;
+ ssize_t n_written;
off_t offs;
ut_a((offset & 0xFFFFFFFFUL) == offset);
@@ -2359,7 +2397,18 @@ os_file_pwrite(
os_n_pending_writes++;
os_mutex_exit(os_file_count_mutex);
- ret = pwrite(file, buf, (ssize_t)n, offs);
+ /* Handle signal interruptions correctly */
+ for (ret = 0; ret < (ssize_t) n; ) {
+ n_written = pwrite(file, buf, (ssize_t)n, offs);
+ if (n_written > 0) {
+ ret += n_written;
+ offs += n_written;
+ } else if (n_written == -1 && errno == EINTR) {
+ continue;
+ } else {
+ break;
+ }
+ }
os_mutex_enter(os_file_count_mutex);
os_file_n_pending_pwrites--;
@@ -2375,7 +2424,7 @@ os_file_pwrite(
the OS crashes, a database page is only partially
physically written to disk. */
- ut_a(TRUE == os_file_flush(file));
+ ut_a(TRUE == os_file_flush(file, TRUE));
}
# endif /* UNIV_DO_FLUSH */
@@ -2406,7 +2455,17 @@ os_file_pwrite(
goto func_exit;
}
- ret = write(file, buf, (ssize_t)n);
+ /* Handle signal interruptions correctly */
+ for (ret = 0; ret < (ssize_t) n; ) {
+ n_written = write(file, buf, (ssize_t)n);
+ if (n_written > 0) {
+ ret += n_written;
+ } else if (n_written == -1 && errno == EINTR) {
+ continue;
+ } else {
+ break;
+ }
+ }
# ifdef UNIV_DO_FLUSH
if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
@@ -2417,7 +2476,7 @@ os_file_pwrite(
the OS crashes, a database page is only partially
physically written to disk. */
- ut_a(TRUE == os_file_flush(file));
+ ut_a(TRUE == os_file_flush(file, TRUE));
}
# endif /* UNIV_DO_FLUSH */
@@ -3866,7 +3925,7 @@ os_aio_windows_handle(
#ifdef UNIV_DO_FLUSH
if (slot->type == OS_FILE_WRITE
&& !os_do_not_call_flush_at_each_write) {
- ut_a(TRUE == os_file_flush(slot->file));
+ ut_a(TRUE == os_file_flush(slot->file, TRUE));
}
#endif /* UNIV_DO_FLUSH */
} else if (os_file_handle_error(slot->name, "Windows aio")) {
diff --git a/storage/xtradb/os/os0proc.c b/storage/xtradb/os/os0proc.c
index 4567d96b6f4..48922886f23 100644
--- a/storage/xtradb/os/os0proc.c
+++ b/storage/xtradb/os/os0proc.c
@@ -229,173 +229,3 @@ os_mem_free_large(
}
#endif
}
-
-/****************************************************************//**
-Allocates or attaches and reuses shared memory segment.
-The content is not cleared automatically.
-@return allocated memory */
-UNIV_INTERN
-void*
-os_shm_alloc(
-/*=========*/
- ulint* n, /*!< in/out: number of bytes */
- uint key,
- ibool* is_new)
-{
- void* ptr;
-#if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H
- ulint size;
- int shmid;
-
- *is_new = FALSE;
- fprintf(stderr,
- "InnoDB: The shared memory segment containing the buffer pool is: key %#x (%d).\n",
- key, key);
-# if defined HAVE_LARGE_PAGES && defined UNIV_LINUX
- if (!os_use_large_pages || !os_large_page_size) {
- goto skip;
- }
-
- /* Align block size to os_large_page_size */
- ut_ad(ut_is_2pow(os_large_page_size));
- size = ut_2pow_round(*n + (os_large_page_size - 1),
- os_large_page_size);
-
- shmid = shmget((key_t)key, (size_t)size,
- IPC_CREAT | IPC_EXCL | SHM_HUGETLB | SHM_R | SHM_W);
- if (shmid < 0) {
- if (errno == EEXIST) {
- fprintf(stderr,
- "InnoDB: HugeTLB: The shared memory segment exists.\n");
- shmid = shmget((key_t)key, (size_t)size,
- SHM_HUGETLB | SHM_R | SHM_W);
- if (shmid < 0) {
- fprintf(stderr,
- "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. (reuse) errno %d\n",
- size, errno);
- goto skip;
- } else {
- fprintf(stderr,
- "InnoDB: HugeTLB: The existent shared memory segment is used.\n");
- }
- } else {
- fprintf(stderr,
- "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. (new) errno %d\n",
- size, errno);
- goto skip;
- }
- } else {
- *is_new = TRUE;
- fprintf(stderr,
- "InnoDB: HugeTLB: A new shared memory segment has been created .\n");
- }
-
- ptr = shmat(shmid, NULL, 0);
- if (ptr == (void *)-1) {
- fprintf(stderr,
- "InnoDB: HugeTLB: Warning: Failed to attach shared memory segment, errno %d\n",
- errno);
- ptr = NULL;
- }
-
- if (ptr) {
- *n = size;
- os_fast_mutex_lock(&ut_list_mutex);
- ut_total_allocated_memory += size;
- os_fast_mutex_unlock(&ut_list_mutex);
- UNIV_MEM_ALLOC(ptr, size);
- return(ptr);
- }
-skip:
- *is_new = FALSE;
-# endif /* HAVE_LARGE_PAGES && defined UNIV_LINUX */
-# ifdef HAVE_GETPAGESIZE
- size = getpagesize();
-# else
- size = UNIV_PAGE_SIZE;
-# endif
- /* Align block size to system page size */
- ut_ad(ut_is_2pow(size));
- size = *n = ut_2pow_round(*n + (size - 1), size);
-
- shmid = shmget((key_t)key, (size_t)size,
- IPC_CREAT | IPC_EXCL | SHM_R | SHM_W);
- if (shmid < 0) {
- if (errno == EEXIST) {
- fprintf(stderr,
- "InnoDB: A shared memory segment containing the buffer pool seems to already exist.\n");
- shmid = shmget((key_t)key, (size_t)size,
- SHM_R | SHM_W);
- if (shmid < 0) {
- fprintf(stderr,
- "InnoDB: Warning: Failed to allocate %lu bytes. (reuse) errno %d\n",
- size, errno);
- ptr = NULL;
- goto end;
- } else {
- fprintf(stderr,
- "InnoDB: The existent shared memory segment is used.\n");
- }
- } else {
- fprintf(stderr,
- "InnoDB: Warning: Failed to allocate %lu bytes. (new) errno %d\n",
- size, errno);
- ptr = NULL;
- goto end;
- }
- } else {
- *is_new = TRUE;
- fprintf(stderr,
- "InnoDB: A new shared memory segment has been created.\n");
- }
-
- ptr = shmat(shmid, NULL, 0);
- if (ptr == (void *)-1) {
- fprintf(stderr,
- "InnoDB: Warning: Failed to attach shared memory segment, errno %d\n",
- errno);
- ptr = NULL;
- }
-
- if (ptr) {
- *n = size;
- os_fast_mutex_lock(&ut_list_mutex);
- ut_total_allocated_memory += size;
- os_fast_mutex_unlock(&ut_list_mutex);
- UNIV_MEM_ALLOC(ptr, size);
- }
-end:
-#else /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
- fprintf(stderr, "InnoDB: shared memory segment is not supported.\n");
- ptr = NULL;
-#endif /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
- return(ptr);
-}
-
-/****************************************************************//**
-Detach shared memory segment. */
-UNIV_INTERN
-void
-os_shm_free(
-/*========*/
- void *ptr, /*!< in: pointer returned by
- os_shm_alloc() */
- ulint size) /*!< in: size returned by
- os_shm_alloc() */
-{
- os_fast_mutex_lock(&ut_list_mutex);
- ut_a(ut_total_allocated_memory >= size);
- os_fast_mutex_unlock(&ut_list_mutex);
-
-#if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H
- if (!shmdt(ptr)) {
- os_fast_mutex_lock(&ut_list_mutex);
- ut_a(ut_total_allocated_memory >= size);
- ut_total_allocated_memory -= size;
- os_fast_mutex_unlock(&ut_list_mutex);
- UNIV_MEM_FREE(ptr, size);
- }
-#else /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
- fprintf(stderr, "InnoDB: shared memory segment is not supported.\n");
-#endif /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
-}
diff --git a/storage/xtradb/page/page0cur.c b/storage/xtradb/page/page0cur.c
index f10f16a7dd9..b8c492328e8 100644
--- a/storage/xtradb/page/page0cur.c
+++ b/storage/xtradb/page/page0cur.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -1149,6 +1149,8 @@ use_heap:
current_rec, index, mtr);
}
+ btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert");
+
return(insert_rec);
}
@@ -1178,14 +1180,15 @@ page_cur_insert_rec_zip_reorg(
/* Before trying to reorganize the page,
store the number of preceding records on the page. */
pos = page_rec_get_n_recs_before(rec);
+ ut_ad(pos > 0);
if (page_zip_reorganize(block, index, mtr)) {
/* The page was reorganized: Find rec by seeking to pos,
and update *current_rec. */
- rec = page + PAGE_NEW_INFIMUM;
-
- while (--pos) {
- rec = page + rec_get_next_offs(rec, TRUE);
+ if (pos > 1) {
+ rec = page_rec_get_nth(page, pos - 1);
+ } else {
+ rec = page + PAGE_NEW_INFIMUM;
}
*current_rec = rec;
@@ -1195,10 +1198,12 @@ page_cur_insert_rec_zip_reorg(
}
/* Out of space: restore the page */
+ btr_blob_dbg_remove(page, index, "insert_zip_fail");
if (!page_zip_decompress(page_zip, page, FALSE)) {
ut_error; /* Memory corrupted? */
}
ut_ad(page_validate(page, index));
+ btr_blob_dbg_add(page, index, "insert_zip_fail");
return(NULL);
}
@@ -1279,6 +1284,12 @@ page_cur_insert_rec_zip(
insert_rec = page_cur_insert_rec_zip_reorg(
current_rec, block, index, insert_rec,
page, page_zip, mtr);
+#ifdef UNIV_DEBUG
+ if (insert_rec) {
+ rec_offs_make_valid(
+ insert_rec, index, offsets);
+ }
+#endif /* UNIV_DEBUG */
}
return(insert_rec);
@@ -1490,6 +1501,8 @@ use_heap:
page_zip_write_rec(page_zip, insert_rec, index, offsets, 1);
+ btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert_zip_ok");
+
/* 9. Write log record of the insert */
if (UNIV_LIKELY(mtr != NULL)) {
page_cur_insert_rec_write_log(insert_rec, rec_size,
@@ -1697,6 +1710,9 @@ page_copy_rec_list_end_to_created_page(
heap_top += rec_size;
+ rec_offs_make_valid(insert_rec, index, offsets);
+ btr_blob_dbg_add_rec(insert_rec, index, offsets, "copy_end");
+
page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec,
index, mtr);
prev_rec = insert_rec;
@@ -1944,6 +1960,7 @@ page_cur_delete_rec(
page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1);
/* 6. Free the memory occupied by the record */
+ btr_blob_dbg_remove_rec(current_rec, index, offsets, "delete");
page_mem_free(page, page_zip, current_rec, index, offsets);
/* 7. Now we have decremented the number of owned records of the slot.
diff --git a/storage/xtradb/page/page0page.c b/storage/xtradb/page/page0page.c
index 10008f9ac25..a284b1480a3 100644
--- a/storage/xtradb/page/page0page.c
+++ b/storage/xtradb/page/page0page.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -685,12 +685,16 @@ page_copy_rec_list_end(
if (UNIV_UNLIKELY
(!page_zip_reorganize(new_block, index, mtr))) {
+ btr_blob_dbg_remove(new_page, index,
+ "copy_end_reorg_fail");
if (UNIV_UNLIKELY
(!page_zip_decompress(new_page_zip,
new_page, FALSE))) {
ut_error;
}
ut_ad(page_validate(new_page, index));
+ btr_blob_dbg_add(new_page, index,
+ "copy_end_reorg_fail");
return(NULL);
} else {
/* The page was reorganized:
@@ -803,12 +807,16 @@ page_copy_rec_list_start(
if (UNIV_UNLIKELY
(!page_zip_reorganize(new_block, index, mtr))) {
+ btr_blob_dbg_remove(new_page, index,
+ "copy_start_reorg_fail");
if (UNIV_UNLIKELY
(!page_zip_decompress(new_page_zip,
new_page, FALSE))) {
ut_error;
}
ut_ad(page_validate(new_page, index));
+ btr_blob_dbg_add(new_page, index,
+ "copy_start_reorg_fail");
return(NULL);
} else {
/* The page was reorganized:
@@ -1080,6 +1088,9 @@ page_delete_rec_list_end(
/* Remove the record chain segment from the record chain */
page_rec_set_next(prev_rec, page_get_supremum_rec(page));
+ btr_blob_dbg_op(page, rec, index, "delete_end",
+ btr_blob_dbg_remove_rec);
+
/* Catenate the deleted chain segment to the page free list */
page_rec_set_next(last_rec, page_header_get_ptr(page, PAGE_FREE));
@@ -1476,55 +1487,54 @@ page_dir_balance_slot(
}
}
-#ifndef UNIV_HOTBACKUP
/************************************************************//**
-Returns the middle record of the record list. If there are an even number
-of records in the list, returns the first record of the upper half-list.
-@return middle record */
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record */
UNIV_INTERN
-rec_t*
-page_get_middle_rec(
-/*================*/
- page_t* page) /*!< in: page */
+const rec_t*
+page_rec_get_nth_const(
+/*===================*/
+ const page_t* page, /*!< in: page */
+ ulint nth) /*!< in: nth record */
{
- page_dir_slot_t* slot;
- ulint middle;
+ const page_dir_slot_t* slot;
ulint i;
ulint n_owned;
- ulint count;
- rec_t* rec;
+ const rec_t* rec;
- /* This many records we must leave behind */
- middle = (page_get_n_recs(page) + PAGE_HEAP_NO_USER_LOW) / 2;
-
- count = 0;
+ ut_ad(nth < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1));
for (i = 0;; i++) {
slot = page_dir_get_nth_slot(page, i);
n_owned = page_dir_slot_get_n_owned(slot);
- if (count + n_owned > middle) {
+ if (n_owned > nth) {
break;
} else {
- count += n_owned;
+ nth -= n_owned;
}
}
ut_ad(i > 0);
slot = page_dir_get_nth_slot(page, i - 1);
- rec = (rec_t*) page_dir_slot_get_rec(slot);
- rec = page_rec_get_next(rec);
-
- /* There are now count records behind rec */
+ rec = page_dir_slot_get_rec(slot);
- for (i = 0; i < middle - count; i++) {
- rec = page_rec_get_next(rec);
+ if (page_is_comp(page)) {
+ do {
+ rec = page_rec_get_next_low(rec, TRUE);
+ ut_ad(rec);
+ } while (nth--);
+ } else {
+ do {
+ rec = page_rec_get_next_low(rec, FALSE);
+ ut_ad(rec);
+ } while (nth--);
}
return(rec);
}
-#endif /* !UNIV_HOTBACKUP */
/***************************************************************//**
Returns the number of records before the given record in chain.
@@ -1586,6 +1596,7 @@ page_rec_get_n_recs_before(
n--;
ut_ad(n >= 0);
+ ut_ad(n < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1));
return((ulint) n);
}
diff --git a/storage/xtradb/page/page0zip.c b/storage/xtradb/page/page0zip.c
index 5b4f5d3b76a..3d5c5a226c7 100644
--- a/storage/xtradb/page/page0zip.c
+++ b/storage/xtradb/page/page0zip.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2005, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -151,6 +151,20 @@ page_zip_empty_size(
#endif /* !UNIV_HOTBACKUP */
/*************************************************************//**
+Gets the number of elements in the dense page directory,
+including deleted records (the free list).
+@return number of elements in the dense page directory */
+UNIV_INLINE
+ulint
+page_zip_dir_elems(
+/*===============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ /* Exclude the page infimum and supremum from the record count. */
+ return(page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW);
+}
+
+/*************************************************************//**
Gets the size of the compressed page trailer (the dense page directory),
including deleted records (the free list).
@return length of dense page directory, in bytes */
@@ -160,14 +174,42 @@ page_zip_dir_size(
/*==============*/
const page_zip_des_t* page_zip) /*!< in: compressed page */
{
- /* Exclude the page infimum and supremum from the record count. */
- ulint size = PAGE_ZIP_DIR_SLOT_SIZE
- * (page_dir_get_n_heap(page_zip->data)
- - PAGE_HEAP_NO_USER_LOW);
- return(size);
+ return(PAGE_ZIP_DIR_SLOT_SIZE * page_zip_dir_elems(page_zip));
+}
+
+/*************************************************************//**
+Gets an offset to the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@return offset of the dense page directory */
+UNIV_INLINE
+ulint
+page_zip_dir_start_offs(
+/*====================*/
+ const page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint n_dense) /*!< in: directory size */
+{
+ ut_ad(n_dense * PAGE_ZIP_DIR_SLOT_SIZE < page_zip_get_size(page_zip));
+
+ return(page_zip_get_size(page_zip) - n_dense * PAGE_ZIP_DIR_SLOT_SIZE);
}
/*************************************************************//**
+Gets a pointer to the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@param[in] page_zip compressed page
+@param[in] n_dense number of entries in the directory
+@return pointer to the dense page directory */
+#define page_zip_dir_start_low(page_zip, n_dense) \
+ ((page_zip)->data + page_zip_dir_start_offs(page_zip, n_dense))
+/*************************************************************//**
+Gets a pointer to the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@param[in] page_zip compressed page
+@return pointer to the dense page directory */
+#define page_zip_dir_start(page_zip) \
+ page_zip_dir_start_low(page_zip, page_zip_dir_elems(page_zip))
+
+/*************************************************************//**
Gets the size of the compressed page trailer (the dense page directory),
only including user records (excluding the free list).
@return length of dense page directory comprising existing records, in bytes */
@@ -653,13 +695,13 @@ page_zip_dir_encode(
Allocate memory for zlib. */
static
void*
-page_zip_malloc(
+page_zip_zalloc(
/*============*/
void* opaque, /*!< in/out: memory heap */
uInt items, /*!< in: number of items to allocate */
uInt size) /*!< in: size of an item in bytes */
{
- return(mem_heap_alloc(opaque, items * size));
+ return(mem_heap_zalloc(opaque, items * size));
}
/**********************************************************************//**
@@ -684,7 +726,7 @@ page_zip_set_alloc(
{
z_stream* strm = stream;
- strm->zalloc = page_zip_malloc;
+ strm->zalloc = page_zip_zalloc;
strm->zfree = page_zip_free;
strm->opaque = heap;
}
@@ -2246,8 +2288,7 @@ zlib_done:
}
/* Restore the uncompressed columns in heap_no order. */
- storage = page_zip->data + page_zip_get_size(page_zip)
- - n_dense * PAGE_ZIP_DIR_SLOT_SIZE;
+ storage = page_zip_dir_start_low(page_zip, n_dense);
for (slot = 0; slot < n_dense; slot++) {
rec_t* rec = recs[slot];
@@ -2732,8 +2773,7 @@ zlib_done:
return(FALSE);
}
- storage = page_zip->data + page_zip_get_size(page_zip)
- - n_dense * PAGE_ZIP_DIR_SLOT_SIZE;
+ storage = page_zip_dir_start_low(page_zip, n_dense);
externs = storage - n_dense
* (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
@@ -2916,19 +2956,18 @@ zlib_error:
page_zip_set_alloc(&d_stream, heap);
- if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT)
- != Z_OK)) {
- ut_error;
- }
-
d_stream.next_in = page_zip->data + PAGE_DATA;
/* Subtract the space reserved for
the page header and the end marker of the modification log. */
d_stream.avail_in = page_zip_get_size(page_zip) - (PAGE_DATA + 1);
-
d_stream.next_out = page + PAGE_ZIP_START;
d_stream.avail_out = UNIV_PAGE_SIZE - PAGE_ZIP_START;
+ if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT)
+ != Z_OK)) {
+ ut_error;
+ }
+
/* Decode the zlib header and the index information. */
if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {
@@ -3462,9 +3501,7 @@ page_zip_write_rec(
}
/* Write the data bytes. Store the uncompressed bytes separately. */
- storage = page_zip->data + page_zip_get_size(page_zip)
- - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
- * PAGE_ZIP_DIR_SLOT_SIZE;
+ storage = page_zip_dir_start(page_zip);
if (page_is_leaf(page)) {
ulint len;
@@ -3760,9 +3797,7 @@ corrupt:
field = page + offset;
storage = page_zip->data + z_offset;
- storage_end = page_zip->data + page_zip_get_size(page_zip)
- - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
- * PAGE_ZIP_DIR_SLOT_SIZE;
+ storage_end = page_zip_dir_start(page_zip);
heap_no = 1 + (storage_end - storage) / REC_NODE_PTR_SIZE;
@@ -3798,7 +3833,9 @@ page_zip_write_node_ptr(
{
byte* field;
byte* storage;
+#ifdef UNIV_DEBUG
page_t* page = page_align(rec);
+#endif /* UNIV_DEBUG */
ut_ad(PAGE_ZIP_MATCH(rec, page_zip));
ut_ad(page_simple_validate_new(page));
@@ -3815,9 +3852,7 @@ page_zip_write_node_ptr(
UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
UNIV_MEM_ASSERT_RW(rec, size);
- storage = page_zip->data + page_zip_get_size(page_zip)
- - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
- * PAGE_ZIP_DIR_SLOT_SIZE
+ storage = page_zip_dir_start(page_zip)
- (rec_get_heap_no_new(rec) - 1) * REC_NODE_PTR_SIZE;
field = rec + size - REC_NODE_PTR_SIZE;
@@ -3866,7 +3901,9 @@ page_zip_write_trx_id_and_roll_ptr(
{
byte* field;
byte* storage;
+#ifdef UNIV_DEBUG
page_t* page = page_align(rec);
+#endif /* UNIV_DEBUG */
ulint len;
ut_ad(PAGE_ZIP_MATCH(rec, page_zip));
@@ -3884,9 +3921,7 @@ page_zip_write_trx_id_and_roll_ptr(
UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
- storage = page_zip->data + page_zip_get_size(page_zip)
- - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
- * PAGE_ZIP_DIR_SLOT_SIZE
+ storage = page_zip_dir_start(page_zip)
- (rec_get_heap_no_new(rec) - 1)
* (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
@@ -3917,17 +3952,9 @@ page_zip_write_trx_id_and_roll_ptr(
UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
}
-#ifdef UNIV_ZIP_DEBUG
-/** Set this variable in a debugger to disable page_zip_clear_rec().
-The only observable effect should be the compression ratio due to
-deleted records not being zeroed out. In rare cases, there can be
-page_zip_validate() failures on the node_ptr, trx_id and roll_ptr
-columns if the space is reallocated for a smaller record. */
-UNIV_INTERN ibool page_zip_clear_rec_disable;
-#endif /* UNIV_ZIP_DEBUG */
-
/**********************************************************************//**
-Clear an area on the uncompressed and compressed page, if possible. */
+Clear an area on the uncompressed and compressed page.
+Do not clear the data payload, as that would grow the modification log. */
static
void
page_zip_clear_rec(
@@ -3939,6 +3966,9 @@ page_zip_clear_rec(
{
ulint heap_no;
page_t* page = page_align(rec);
+ byte* storage;
+ byte* field;
+ ulint len;
/* page_zip_validate() would fail here if a record
containing externally stored columns is being deleted. */
ut_ad(rec_offs_validate(rec, index, offsets));
@@ -3954,60 +3984,38 @@ page_zip_clear_rec(
UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
rec_offs_extra_size(offsets));
- if (
-#ifdef UNIV_ZIP_DEBUG
- !page_zip_clear_rec_disable &&
-#endif /* UNIV_ZIP_DEBUG */
- page_zip->m_end
- + 1 + ((heap_no - 1) >= 64)/* size of the log entry */
- + page_zip_get_trailer_len(page_zip,
- dict_index_is_clust(index), NULL)
- < page_zip_get_size(page_zip)) {
- byte* data;
-
- /* Clear only the data bytes, because the allocator and
- the decompressor depend on the extra bytes. */
- memset(rec, 0, rec_offs_data_size(offsets));
-
- if (!page_is_leaf(page)) {
- /* Clear node_ptr on the compressed page. */
- byte* storage = page_zip->data
- + page_zip_get_size(page_zip)
- - (page_dir_get_n_heap(page)
- - PAGE_HEAP_NO_USER_LOW)
- * PAGE_ZIP_DIR_SLOT_SIZE;
-
- memset(storage - (heap_no - 1) * REC_NODE_PTR_SIZE,
- 0, REC_NODE_PTR_SIZE);
- } else if (dict_index_is_clust(index)) {
- /* Clear trx_id and roll_ptr on the compressed page. */
- byte* storage = page_zip->data
- + page_zip_get_size(page_zip)
- - (page_dir_get_n_heap(page)
- - PAGE_HEAP_NO_USER_LOW)
- * PAGE_ZIP_DIR_SLOT_SIZE;
-
- memset(storage - (heap_no - 1)
- * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN),
- 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
- }
+ if (!page_is_leaf(page)) {
+ /* Clear node_ptr. On the compressed page,
+ there is an array of node_ptr immediately before the
+ dense page directory, at the very end of the page. */
+ storage = page_zip_dir_start(page_zip);
+ ut_ad(dict_index_get_n_unique_in_tree(index) ==
+ rec_offs_n_fields(offsets) - 1);
+ field = rec_get_nth_field(rec, offsets,
+ rec_offs_n_fields(offsets) - 1,
+ &len);
+ ut_ad(len == REC_NODE_PTR_SIZE);
- /* Log that the data was zeroed out. */
- data = page_zip->data + page_zip->m_end;
- ut_ad(!*data);
- if (UNIV_UNLIKELY(heap_no - 1 >= 64)) {
- *data++ = (byte) (0x80 | (heap_no - 1) >> 7);
- ut_ad(!*data);
- }
- *data++ = (byte) ((heap_no - 1) << 1 | 1);
- ut_ad(!*data);
- ut_ad((ulint) (data - page_zip->data)
- < page_zip_get_size(page_zip));
- page_zip->m_end = data - page_zip->data;
- page_zip->m_nonempty = TRUE;
- } else if (page_is_leaf(page) && dict_index_is_clust(index)) {
- /* Do not clear the record, because there is not enough space
- to log the operation. */
+ ut_ad(!rec_offs_any_extern(offsets));
+ memset(field, 0, REC_NODE_PTR_SIZE);
+ memset(storage - (heap_no - 1) * REC_NODE_PTR_SIZE,
+ 0, REC_NODE_PTR_SIZE);
+ } else if (dict_index_is_clust(index)) {
+ /* Clear trx_id and roll_ptr. On the compressed page,
+ there is an array of these fields immediately before the
+ dense page directory, at the very end of the page. */
+ const ulint trx_id_pos
+ = dict_col_get_clust_pos(
+ dict_table_get_sys_col(
+ index->table, DATA_TRX_ID), index);
+ storage = page_zip_dir_start(page_zip);
+ field = rec_get_nth_field(rec, offsets, trx_id_pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+
+ memset(field, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ memset(storage - (heap_no - 1)
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN),
+ 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
if (rec_offs_any_extern(offsets)) {
ulint i;
@@ -4016,15 +4024,18 @@ page_zip_clear_rec(
/* Clear all BLOB pointers in order to make
page_zip_validate() pass. */
if (rec_offs_nth_extern(offsets, i)) {
- ulint len;
- byte* field = rec_get_nth_field(
+ field = rec_get_nth_field(
rec, offsets, i, &len);
+ ut_ad(len
+ == BTR_EXTERN_FIELD_REF_SIZE);
memset(field + len
- BTR_EXTERN_FIELD_REF_SIZE,
0, BTR_EXTERN_FIELD_REF_SIZE);
}
}
}
+ } else {
+ ut_ad(!rec_offs_any_extern(offsets));
}
#ifdef UNIV_ZIP_DEBUG
@@ -4455,6 +4466,8 @@ page_zip_reorganize(
/* Copy the old page to temporary space */
buf_frame_copy(temp_page, page);
+ btr_blob_dbg_remove(page, index, "zip_reorg");
+
/* Recreate the page: note that global data on page (possible
segment headers, next page-field, etc.) is preserved intact */
@@ -4513,7 +4526,7 @@ page_zip_copy_recs(
mtr_t* mtr) /*!< in: mini-transaction */
{
ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
- ut_ad(mtr_memo_contains_page(mtr, (page_t*) src, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, src, MTR_MEMO_PAGE_X_FIX));
ut_ad(!dict_index_is_ibuf(index));
#ifdef UNIV_ZIP_DEBUG
/* The B-tree operations that call this function may set
@@ -4583,6 +4596,7 @@ page_zip_copy_recs(
#ifdef UNIV_ZIP_DEBUG
ut_a(page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
+ btr_blob_dbg_add(page, index, "page_zip_copy_recs");
page_zip_compress_write_log(page_zip, page, index, mtr);
}
diff --git a/storage/xtradb/que/que0que.c b/storage/xtradb/que/que0que.c
index 9c1d61c1731..5fccbb180fe 100644
--- a/storage/xtradb/que/que0que.c
+++ b/storage/xtradb/que/que0que.c
@@ -1418,6 +1418,12 @@ que_eval_sql(
ut_a(trx->error_state == DB_SUCCESS);
+ if (trx->fake_changes) {
+ /* fake_changes should not access to system tables */
+ fprintf(stderr, "InnoDB: ERROR: innodb_fake_changes tried to access to system tables.\n");
+ return(DB_ERROR);
+ }
+
if (reserve_dict_mutex) {
mutex_enter(&dict_sys->mutex);
}
diff --git a/storage/xtradb/rem/rem0rec.c b/storage/xtradb/rem/rem0rec.c
index 37ba8ca2ffe..9f90d2940dd 100644
--- a/storage/xtradb/rem/rem0rec.c
+++ b/storage/xtradb/rem/rem0rec.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -408,7 +408,7 @@ rec_init_offsets(
do {
ulint len;
if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
- len = offs += 4;
+ len = offs += REC_NODE_PTR_SIZE;
goto resolved;
}
@@ -640,7 +640,7 @@ rec_get_offsets_reverse(
do {
ulint len;
if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
- len = offs += 4;
+ len = offs += REC_NODE_PTR_SIZE;
goto resolved;
}
@@ -1131,9 +1131,9 @@ rec_convert_dtuple_to_rec_comp(
if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
ut_ad(dtype_get_prtype(type) & DATA_NOT_NULL);
- ut_ad(len == 4);
+ ut_ad(len == REC_NODE_PTR_SIZE);
memcpy(end, dfield_get_data(field), len);
- end += 4;
+ end += REC_NODE_PTR_SIZE;
break;
}
diff --git a/storage/xtradb/row/row0ins.c b/storage/xtradb/row/row0ins.c
index efcea62f212..0bcbc14fbf6 100644
--- a/storage/xtradb/row/row0ins.c
+++ b/storage/xtradb/row/row0ins.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -345,9 +345,9 @@ row_ins_clust_index_entry_by_modify(
return(DB_LOCK_TABLE_FULL);
}
- err = btr_cur_pessimistic_update(0, cursor,
- heap, big_rec, update,
- 0, thr, mtr);
+ err = btr_cur_pessimistic_update(
+ BTR_KEEP_POS_FLAG, cursor, heap, big_rec, update,
+ 0, thr, mtr);
}
return(err);
@@ -434,11 +434,9 @@ row_ins_cascade_calc_update_vec(
dict_table_t* table = foreign->foreign_table;
dict_index_t* index = foreign->foreign_index;
upd_t* update;
- upd_field_t* ufield;
dict_table_t* parent_table;
dict_index_t* parent_index;
upd_t* parent_update;
- upd_field_t* parent_ufield;
ulint n_fields_updated;
ulint parent_field_no;
ulint i;
@@ -474,13 +472,15 @@ row_ins_cascade_calc_update_vec(
dict_index_get_nth_col_no(parent_index, i));
for (j = 0; j < parent_update->n_fields; j++) {
- parent_ufield = parent_update->fields + j;
+ const upd_field_t* parent_ufield
+ = &parent_update->fields[j];
if (parent_ufield->field_no == parent_field_no) {
ulint min_size;
const dict_col_t* col;
ulint ufield_len;
+ upd_field_t* ufield;
col = dict_index_get_nth_col(index, i);
@@ -493,6 +493,8 @@ row_ins_cascade_calc_update_vec(
ufield->field_no
= dict_table_get_nth_col_pos(
table, dict_col_get_no(col));
+
+ ufield->orig_len = 0;
ufield->exp = NULL;
ufield->new_val = parent_ufield->new_val;
@@ -993,10 +995,9 @@ row_ins_foreign_check_on_constraint(
goto nonstandard_exit_func;
}
- if ((node->is_delete
- && (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL))
- || (!node->is_delete
- && (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL))) {
+ if (node->is_delete
+ ? (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL)
+ : (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL)) {
/* Build the appropriate update vector which sets
foreign->n_fields first fields in rec to SQL NULL */
@@ -1005,6 +1006,8 @@ row_ins_foreign_check_on_constraint(
update->info_bits = 0;
update->n_fields = foreign->n_fields;
+ UNIV_MEM_INVALID(update->fields,
+ update->n_fields * sizeof *update->fields);
for (i = 0; i < foreign->n_fields; i++) {
upd_field_t* ufield = &update->fields[i];
@@ -1512,6 +1515,11 @@ exit_func:
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
+
+ if (trx->fake_changes) {
+ err = DB_SUCCESS;
+ }
+
return(err);
}
@@ -1992,6 +2000,7 @@ row_ins_index_entry_low(
ulint modify = 0; /* remove warning */
rec_t* insert_rec;
rec_t* rec;
+ ulint* offsets;
ulint err;
ulint n_unique;
big_rec_t* big_rec = NULL;
@@ -2013,7 +2022,7 @@ row_ins_index_entry_low(
}
btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
- mode | BTR_INSERT | ignore_sec_unique,
+ thr_get_trx(thr)->fake_changes ? BTR_SEARCH_LEAF : (mode | BTR_INSERT | ignore_sec_unique),
&cursor, 0, __FILE__, __LINE__, &mtr);
if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) {
@@ -2073,7 +2082,7 @@ row_ins_index_entry_low(
btr_cur_search_to_nth_level(index, 0, entry,
PAGE_CUR_LE,
- mode | BTR_INSERT,
+ thr_get_trx(thr)->fake_changes ? BTR_SEARCH_LEAF : (mode | BTR_INSERT),
&cursor, 0,
__FILE__, __LINE__, &mtr);
}
@@ -2097,6 +2106,42 @@ row_ins_index_entry_low(
err = row_ins_clust_index_entry_by_modify(
mode, &cursor, &heap, &big_rec, entry,
thr, &mtr);
+
+ if (big_rec) {
+ ut_a(err == DB_SUCCESS);
+ /* Write out the externally stored
+ columns while still x-latching
+ index->lock and block->lock. We have
+ to mtr_commit(mtr) first, so that the
+ redo log will be written in the
+ correct order. Otherwise, we would run
+ into trouble on crash recovery if mtr
+ freed B-tree pages on which some of
+ the big_rec fields will be written. */
+ btr_cur_mtr_commit_and_start(&cursor, &mtr);
+
+ rec = btr_cur_get_rec(&cursor);
+ offsets = rec_get_offsets(
+ rec, index, NULL,
+ ULINT_UNDEFINED, &heap);
+
+ err = btr_store_big_rec_extern_fields(
+ index, btr_cur_get_block(&cursor),
+ rec, offsets, &mtr, FALSE, big_rec);
+ /* If writing big_rec fails (for
+ example, because of DB_OUT_OF_FILE_SPACE),
+ the record will be corrupted. Even if
+ we did not update any externally
+ stored columns, our update could cause
+ the record to grow so that a
+ non-updated column was selected for
+ external storage. This non-update
+ would not have been written to the
+ undo log, and thus the record cannot
+ be rolled back. */
+ ut_a(err == DB_SUCCESS);
+ goto stored_big_rec;
+ }
} else {
ut_ad(!n_ext);
err = row_ins_sec_index_entry_by_modify(
@@ -2125,8 +2170,22 @@ function_exit:
mtr_commit(&mtr);
if (UNIV_LIKELY_NULL(big_rec)) {
- rec_t* rec;
- ulint* offsets;
+
+ if (thr_get_trx(thr)->fake_changes) {
+ /* skip store extern */
+ if (modify) {
+ dtuple_big_rec_free(big_rec);
+ } else {
+ dtuple_convert_back_big_rec(index, entry, big_rec);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(err);
+ }
+
mtr_start(&mtr);
btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
@@ -2140,6 +2199,7 @@ function_exit:
index, btr_cur_get_block(&cursor),
rec, offsets, &mtr, FALSE, big_rec);
+stored_big_rec:
if (modify) {
dtuple_big_rec_free(big_rec);
} else {
diff --git a/storage/xtradb/row/row0mysql.c b/storage/xtradb/row/row0mysql.c
index 1b97cbb0009..cb003ad62e0 100644
--- a/storage/xtradb/row/row0mysql.c
+++ b/storage/xtradb/row/row0mysql.c
@@ -1190,6 +1190,7 @@ run_again:
prebuilt->table->stat_n_rows--;
}
+ if (!(trx->fake_changes))
row_update_statistics_if_needed(prebuilt->table);
trx->op_info = "";
@@ -1450,6 +1451,7 @@ run_again:
that changes indexed columns, UPDATEs that change only non-indexed
columns would not affect statistics. */
if (node->is_delete || !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ if (!(trx->fake_changes))
row_update_statistics_if_needed(prebuilt->table);
}
@@ -1668,6 +1670,7 @@ run_again:
srv_n_rows_updated++;
}
+ if (!(trx->fake_changes))
row_update_statistics_if_needed(table);
return(err);
@@ -2556,10 +2559,29 @@ row_discard_tablespace_for_mysql(
err = DB_ERROR;
} else {
+ dict_index_t* index;
+
/* Set the flag which tells that now it is legal to
IMPORT a tablespace for this table */
table->tablespace_discarded = TRUE;
table->ibd_file_missing = TRUE;
+
+ /* check adaptive hash entries */
+ index = dict_table_get_first_index(table);
+ while (index) {
+ ulint ref_count = btr_search_info_get_ref_count(index->search_info);
+ if (ref_count) {
+ fprintf(stderr, "InnoDB: Warning:"
+ " hash index ref_count (%lu) is not zero"
+ " after fil_discard_tablespace().\n"
+ "index: \"%s\""
+ " table: \"%s\"\n",
+ ref_count,
+ index->name,
+ table->name);
+ }
+ index = dict_table_get_next_index(index);
+ }
}
}
@@ -2597,6 +2619,11 @@ row_import_tablespace_for_mysql(
current_lsn = log_get_lsn();
+ /* Enlarge the fatal lock wait timeout during import. */
+ mutex_enter(&kernel_mutex);
+ srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */
+ mutex_exit(&kernel_mutex);
+
/* It is possible, though very improbable, that the lsn's in the
tablespace to be imported have risen above the current system lsn, if
a lengthy purge, ibuf merge, or rollback was performed on a backup
@@ -2708,6 +2735,11 @@ funct_exit:
trx->op_info = "";
+ /* Restore the fatal semaphore wait timeout */
+ mutex_enter(&kernel_mutex);
+ srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */
+ mutex_exit(&kernel_mutex);
+
return((int) err);
}
@@ -2901,6 +2933,19 @@ row_truncate_table_for_mysql(
table->space = space;
index = dict_table_get_first_index(table);
do {
+ ulint ref_count = btr_search_info_get_ref_count(index->search_info);
+ /* check adaptive hash entries */
+ if (ref_count) {
+ fprintf(stderr, "InnoDB: Warning:"
+ " hash index ref_count (%lu) is not zero"
+ " after fil_discard_tablespace().\n"
+ "index: \"%s\""
+ " table: \"%s\"\n",
+ ref_count,
+ index->name,
+ table->name);
+ }
+
index->space = space;
index = dict_table_get_next_index(index);
} while (index);
diff --git a/storage/xtradb/row/row0row.c b/storage/xtradb/row/row0row.c
index 0783d482f76..cea70e98dee 100644
--- a/storage/xtradb/row/row0row.c
+++ b/storage/xtradb/row/row0row.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -47,35 +47,6 @@ Created 4/20/1996 Heikki Tuuri
#include "read0read.h"
#include "ut0mem.h"
-/*********************************************************************//**
-Gets the offset of trx id field, in bytes relative to the origin of
-a clustered index record.
-@return offset of DATA_TRX_ID */
-UNIV_INTERN
-ulint
-row_get_trx_id_offset(
-/*==================*/
- const rec_t* rec __attribute__((unused)),
- /*!< in: record */
- dict_index_t* index, /*!< in: clustered index */
- const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
-{
- ulint pos;
- ulint offset;
- ulint len;
-
- ut_ad(dict_index_is_clust(index));
- ut_ad(rec_offs_validate(rec, index, offsets));
-
- pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
-
- offset = rec_get_nth_field_offs(offsets, pos, &len);
-
- ut_ad(len == DATA_TRX_ID_LEN);
-
- return(offset);
-}
-
/*****************************************************************//**
When an insert or purge to a table is performed, this function builds
the entry to be inserted into or purged from an index on the table.
@@ -130,12 +101,27 @@ row_build_index_entry(
dfield_copy(dfield, dfield2);
- if (dfield_is_null(dfield) || ind_field->prefix_len == 0) {
+ if (dfield_is_null(dfield)) {
+ continue;
+ }
+
+ if (ind_field->prefix_len == 0
+ && (!dfield_is_ext(dfield)
+ || dict_index_is_clust(index))) {
+ /* The dfield_copy() above suffices for
+ columns that are stored in-page, or for
+ clustered index record columns that are not
+ part of a column prefix in the PRIMARY KEY. */
continue;
}
- /* If a column prefix index, take only the prefix.
- Prefix-indexed columns may be externally stored. */
+ /* If the column is stored externally (off-page) in
+ the clustered index, it must be an ordering field in
+ the secondary index. In the Antelope format, only
+ prefix-indexed columns may be stored off-page in the
+ clustered index record. In the Barracuda format, also
+ fully indexed long CHAR or VARCHAR columns may be
+ stored off-page. */
ut_ad(col->ord_part);
if (UNIV_LIKELY_NULL(ext)) {
@@ -148,17 +134,41 @@ row_build_index_entry(
}
dfield_set_data(dfield, buf, len);
}
+
+ if (ind_field->prefix_len == 0) {
+ /* In the Barracuda format
+ (ROW_FORMAT=DYNAMIC or
+ ROW_FORMAT=COMPRESSED), we can have a
+ secondary index on an entire column
+ that is stored off-page in the
+ clustered index. As this is not a
+ prefix index (prefix_len == 0),
+ include the entire off-page column in
+ the secondary index record. */
+ continue;
+ }
} else if (dfield_is_ext(dfield)) {
+ /* This table is either in Antelope format
+ (ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT)
+ or a purge record where the ordered part of
+ the field is not external.
+ In Antelope, the maximum column prefix
+ index length is 767 bytes, and the clustered
+ index record contains a 768-byte prefix of
+ each off-page column. */
ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
len -= BTR_EXTERN_FIELD_REF_SIZE;
- ut_a(ind_field->prefix_len <= len
- || dict_index_is_clust(index));
+ dfield_set_len(dfield, len);
}
- len = dtype_get_at_most_n_mbchars(
- col->prtype, col->mbminlen, col->mbmaxlen,
- ind_field->prefix_len, len, dfield_get_data(dfield));
- dfield_set_len(dfield, len);
+ /* If a column prefix index, take only the prefix. */
+ if (ind_field->prefix_len) {
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype, col->mbminlen, col->mbmaxlen,
+ ind_field->prefix_len, len,
+ dfield_get_data(dfield));
+ dfield_set_len(dfield, len);
+ }
}
ut_ad(dtuple_check_typed(entry));
@@ -223,6 +233,7 @@ row_build(
ut_ad(index && rec && heap);
ut_ad(dict_index_is_clust(index));
+ ut_ad(!mutex_own(&kernel_mutex));
if (!offsets) {
offsets = rec_get_offsets(rec, index, offsets_,
@@ -231,6 +242,22 @@ row_build(
ut_ad(rec_offs_validate(rec, index, offsets));
}
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ /* This condition can occur during crash recovery before
+ trx_rollback_active() has completed execution.
+
+ This condition is possible if the server crashed
+ during an insert or update before
+ btr_store_big_rec_extern_fields() did mtr_commit() all
+ BLOB pointers to the clustered index record.
+
+ If the record contains a null BLOB pointer, look up the
+ transaction that holds the implicit lock on this record, and
+ assert that it was recovered (and will soon be rolled back). */
+ ut_a(!rec_offs_any_null_extern(rec, offsets)
+ || trx_assert_recovered(row_get_rec_trx_id(rec, index, offsets)));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
if (type != ROW_COPY_POINTERS) {
/* Take a copy of rec to heap */
buf = mem_heap_alloc(heap, rec_offs_size(offsets));
@@ -431,6 +458,10 @@ row_rec_to_index_entry(
rec = rec_copy(buf, rec, offsets);
/* Avoid a debug assertion in rec_offs_validate(). */
rec_offs_make_valid(rec, index, offsets);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ } else {
+ ut_a(!rec_offs_any_null_extern(rec, offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
}
entry = row_rec_to_index_entry_low(rec, index, offsets, n_ext, heap);
diff --git a/storage/xtradb/row/row0sel.c b/storage/xtradb/row/row0sel.c
index 1dbd9e3a42d..4ae55b47b5b 100644
--- a/storage/xtradb/row/row0sel.c
+++ b/storage/xtradb/row/row0sel.c
@@ -2546,6 +2546,8 @@ row_sel_field_store_in_mysql_format(
ut_ad(len != UNIV_SQL_NULL);
UNIV_MEM_ASSERT_RW(data, len);
+ UNIV_MEM_ASSERT_W(dest, templ->mysql_col_len);
+ UNIV_MEM_INVALID(dest, templ->mysql_col_len);
switch (templ->type) {
case DATA_INT:
@@ -2582,14 +2584,16 @@ row_sel_field_store_in_mysql_format(
dest = row_mysql_store_true_var_len(
dest, len, templ->mysql_length_bytes);
+ /* Copy the actual data. Leave the rest of the
+ buffer uninitialized. */
+ memcpy(dest, data, len);
+ break;
}
/* Copy the actual data */
ut_memcpy(dest, data, len);
- /* Pad with trailing spaces. We pad with spaces also the
- unused end of a >= 5.0.3 true VARCHAR column, just in case
- MySQL expects its contents to be deterministic. */
+ /* Pad with trailing spaces. */
pad_ptr = dest + len;
@@ -3155,6 +3159,39 @@ sel_restore_position_for_mysql(
}
/********************************************************************//**
+Copies a cached field for MySQL from the fetch cache. */
+static
+void
+row_sel_copy_cached_field_for_mysql(
+/*================================*/
+ byte* buf, /*!< in/out: row buffer */
+ const byte* cache, /*!< in: cached row */
+ const mysql_row_templ_t*templ) /*!< in: column template */
+{
+ ulint len;
+
+ buf += templ->mysql_col_offset;
+ cache += templ->mysql_col_offset;
+
+ UNIV_MEM_ASSERT_W(buf, templ->mysql_col_len);
+
+ if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR
+ && templ->type != DATA_INT) {
+ /* Check for != DATA_INT to make sure we do
+ not treat MySQL ENUM or SET as a true VARCHAR!
+ Find the actual length of the true VARCHAR field. */
+ row_mysql_read_true_varchar(
+ &len, cache, templ->mysql_length_bytes);
+ len += templ->mysql_length_bytes;
+ UNIV_MEM_INVALID(buf, templ->mysql_col_len);
+ } else {
+ len = templ->mysql_col_len;
+ }
+
+ ut_memcpy(buf, cache, len);
+}
+
+/********************************************************************//**
Pops a cached row for MySQL from the fetch cache. */
UNIV_INLINE
void
@@ -3166,26 +3203,22 @@ row_sel_pop_cached_row_for_mysql(
{
ulint i;
const mysql_row_templ_t*templ;
- byte* cached_rec;
+ const byte* cached_rec;
ut_ad(prebuilt->n_fetch_cached > 0);
ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
+ UNIV_MEM_ASSERT_W(buf, prebuilt->mysql_row_len);
+
+ cached_rec = prebuilt->fetch_cache[prebuilt->fetch_cache_first];
+
if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) {
/* Copy cache record field by field, don't touch fields that
are not covered by current key */
- cached_rec = prebuilt->fetch_cache[
- prebuilt->fetch_cache_first];
for (i = 0; i < prebuilt->n_template; i++) {
templ = prebuilt->mysql_template + i;
-#if 0 /* Some of the cached_rec may legitimately be uninitialized. */
- UNIV_MEM_ASSERT_RW(cached_rec
- + templ->mysql_col_offset,
- templ->mysql_col_len);
-#endif
- ut_memcpy(buf + templ->mysql_col_offset,
- cached_rec + templ->mysql_col_offset,
- templ->mysql_col_len);
+ row_sel_copy_cached_field_for_mysql(
+ buf, cached_rec, templ);
/* Copy NULL bit of the current field from cached_rec
to buf */
if (templ->mysql_null_bit_mask) {
@@ -3199,17 +3232,24 @@ row_sel_pop_cached_row_for_mysql(
templ->mysql_null_bit_mask;
}
}
+ } else if (prebuilt->mysql_prefix_len > 63) {
+ /* The record is long. Copy it field by field, in case
+ there are some long VARCHAR column of which only a
+ small length is being used. */
+ UNIV_MEM_INVALID(buf, prebuilt->mysql_prefix_len);
+
+ /* First copy the NULL bits. */
+ ut_memcpy(buf, cached_rec, prebuilt->null_bitmap_len);
+ /* Then copy the requested fields. */
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+ row_sel_copy_cached_field_for_mysql(
+ buf, cached_rec, prebuilt->mysql_template + i);
+ }
+ } else {
+ ut_memcpy(buf, cached_rec, prebuilt->mysql_prefix_len);
}
- else {
-#if 0 /* Some of the cached_rec may legitimately be uninitialized. */
- UNIV_MEM_ASSERT_RW(prebuilt->fetch_cache
- [prebuilt->fetch_cache_first],
- prebuilt->mysql_prefix_len);
-#endif
- ut_memcpy(buf,
- prebuilt->fetch_cache[prebuilt->fetch_cache_first],
- prebuilt->mysql_prefix_len);
- }
+
prebuilt->n_fetch_cached--;
prebuilt->fetch_cache_first++;
@@ -4086,7 +4126,13 @@ rec_loop:
if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) {
wrong_offs:
- if (srv_force_recovery == 0 || moves_up == FALSE) {
+ if (srv_pass_corrupt_table && !trx_sys_sys_space(index->table->space)) {
+ index->table->is_corrupt = TRUE;
+ fil_space_set_corrupt(index->table->space);
+ }
+
+ if ((srv_force_recovery == 0 || moves_up == FALSE)
+ && srv_pass_corrupt_table <= 1) {
ut_print_timestamp(stderr);
buf_page_print(page_align(rec), 0);
fprintf(stderr,
@@ -4137,7 +4183,8 @@ wrong_offs:
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
- if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
+ if (UNIV_UNLIKELY(srv_force_recovery > 0)
+ || (srv_pass_corrupt_table == 2 && index->table->is_corrupt)) {
if (!rec_validate(rec, offsets)
|| !btr_index_rec_validate(rec, index, FALSE)) {
fprintf(stderr,
diff --git a/storage/xtradb/row/row0upd.c b/storage/xtradb/row/row0upd.c
index a6fb266c4ed..5765d3b76d2 100644
--- a/storage/xtradb/row/row0upd.c
+++ b/storage/xtradb/row/row0upd.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -504,14 +504,49 @@ row_upd_rec_in_place(
n_fields = upd_get_n_fields(update);
for (i = 0; i < n_fields; i++) {
+#ifdef UNIV_BLOB_DEBUG
+ btr_blob_dbg_t b;
+ const byte* field_ref = NULL;
+#endif /* UNIV_BLOB_DEBUG */
+
upd_field = upd_get_nth_field(update, i);
new_val = &(upd_field->new_val);
ut_ad(!dfield_is_ext(new_val) ==
!rec_offs_nth_extern(offsets, upd_field->field_no));
+#ifdef UNIV_BLOB_DEBUG
+ if (dfield_is_ext(new_val)) {
+ ulint len;
+ field_ref = rec_get_nth_field(rec, offsets, i, &len);
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ b.ref_field_no = i;
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+ ut_a(b.ref_field_no >= index->n_uniq);
+ btr_blob_dbg_rbt_delete(index, &b, "upd_in_place");
+ }
+#endif /* UNIV_BLOB_DEBUG */
rec_set_nth_field(rec, offsets, upd_field->field_no,
dfield_get_data(new_val),
dfield_get_len(new_val));
+
+#ifdef UNIV_BLOB_DEBUG
+ if (dfield_is_ext(new_val)) {
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+ b.always_owner = b.owner = !(field_ref[BTR_EXTERN_LEN]
+ & BTR_EXTERN_OWNER_FLAG);
+ b.del = rec_get_deleted_flag(
+ rec, rec_offs_comp(offsets));
+
+ btr_blob_dbg_rbt_insert(index, &b, "upd_in_place");
+ }
+#endif /* UNIV_BLOB_DEBUG */
}
if (UNIV_LIKELY_NULL(page_zip)) {
@@ -1556,8 +1591,9 @@ row_upd_sec_index_entry(
mtr_start(&mtr);
- found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur,
- &mtr);
+ found = row_search_index_entry(index, entry,
+ trx->fake_changes ? BTR_SEARCH_LEAF : BTR_MODIFY_LEAF,
+ &pcur, &mtr);
btr_cur = btr_pcur_get_btr_cur(&pcur);
rec = btr_cur_get_rec(btr_cur);
@@ -1787,9 +1823,11 @@ row_upd_clust_rec_by_insert(
the previous invocation of this function. Mark the
off-page columns in the entry inherited. */
+ if (!(trx->fake_changes)) {
change_ownership = row_upd_clust_rec_by_insert_inherit(
NULL, NULL, entry, node->update);
ut_a(change_ownership);
+ }
/* fall through */
case UPD_NODE_INSERT_CLUSTERED:
/* A lock wait occurred in row_ins_index_entry() in
@@ -1819,7 +1857,7 @@ err_exit:
delete-marked old record, mark them disowned by the
old record and owned by the new entry. */
- if (rec_offs_any_extern(offsets)) {
+ if (rec_offs_any_extern(offsets) && !(trx->fake_changes)) {
change_ownership = row_upd_clust_rec_by_insert_inherit(
rec, offsets, entry, node->update);
@@ -1947,33 +1985,50 @@ row_upd_clust_rec(
the same transaction do not modify the record in the meantime.
Therefore we can assert that the restoration of the cursor succeeds. */
- ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
+ ut_a(btr_pcur_restore_position(thr_get_trx(thr)->fake_changes ? BTR_SEARCH_LEAF : BTR_MODIFY_TREE,
+ pcur, mtr));
ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
dict_table_is_comp(index->table)));
- err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur,
- &heap, &big_rec, node->update,
- node->cmpl_info, thr, mtr);
- mtr_commit(mtr);
-
- if (err == DB_SUCCESS && big_rec) {
+ err = btr_cur_pessimistic_update(
+ BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, btr_cur,
+ &heap, &big_rec, node->update, node->cmpl_info, thr, mtr);
+ /* skip store extern for fake_changes */
+ if (big_rec && !(thr_get_trx(thr)->fake_changes)) {
ulint offsets_[REC_OFFS_NORMAL_SIZE];
rec_t* rec;
rec_offs_init(offsets_);
- mtr_start(mtr);
+ ut_a(err == DB_SUCCESS);
+ /* Write out the externally stored columns while still
+ x-latching index->lock and block->lock. We have to
+ mtr_commit(mtr) first, so that the redo log will be
+ written in the correct order. Otherwise, we would run
+ into trouble on crash recovery if mtr freed B-tree
+ pages on which some of the big_rec fields will be
+ written. */
+ btr_cur_mtr_commit_and_start(btr_cur, mtr);
- ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
rec = btr_cur_get_rec(btr_cur);
err = btr_store_big_rec_extern_fields(
index, btr_cur_get_block(btr_cur), rec,
rec_get_offsets(rec, index, offsets_,
ULINT_UNDEFINED, &heap),
mtr, TRUE, big_rec);
- mtr_commit(mtr);
+ /* If writing big_rec fails (for example, because of
+ DB_OUT_OF_FILE_SPACE), the record will be corrupted.
+ Even if we did not update any externally stored
+ columns, our update could cause the record to grow so
+ that a non-updated column was selected for external
+ storage. This non-update would not have been written
+ to the undo log, and thus the record cannot be rolled
+ back. */
+ ut_a(err == DB_SUCCESS);
}
+ mtr_commit(mtr);
+
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
@@ -2082,7 +2137,8 @@ row_upd_clust_step(
ut_a(pcur->rel_pos == BTR_PCUR_ON);
- success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);
+ success = btr_pcur_restore_position(thr_get_trx(thr)->fake_changes ? BTR_SEARCH_LEAF : BTR_MODIFY_LEAF,
+ pcur, mtr);
if (!success) {
err = DB_RECORD_NOT_FOUND;
diff --git a/storage/xtradb/row/row0vers.c b/storage/xtradb/row/row0vers.c
index d4fde0b939b..8a7bb842293 100644
--- a/storage/xtradb/row/row0vers.c
+++ b/storage/xtradb/row/row0vers.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -550,6 +550,11 @@ row_vers_build_for_consistent_read(
/* The view already sees this version: we can
copy it to in_heap and return */
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(
+ version, *offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
buf = mem_heap_alloc(in_heap,
rec_offs_size(*offsets));
*old_vers = rec_copy(buf, version, *offsets);
@@ -583,6 +588,10 @@ row_vers_build_for_consistent_read(
*offsets = rec_get_offsets(prev_version, index, *offsets,
ULINT_UNDEFINED, offset_heap);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(prev_version, *offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
trx_id = row_get_rec_trx_id(prev_version, index, *offsets);
if (read_view_sees_trx_id(view, trx_id)) {
@@ -682,6 +691,10 @@ row_vers_build_for_semi_consistent_read(
/* We found a version that belongs to a
committed transaction: return it. */
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(version, *offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
if (rec == version) {
*old_vers = rec;
err = DB_SUCCESS;
@@ -739,6 +752,9 @@ row_vers_build_for_semi_consistent_read(
version = prev_version;
*offsets = rec_get_offsets(version, index, *offsets,
ULINT_UNDEFINED, offset_heap);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(version, *offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
}/* for (;;) */
if (heap) {
diff --git a/storage/xtradb/srv/srv0srv.c b/storage/xtradb/srv/srv0srv.c
index 9a142e1ca86..9d74c293405 100644
--- a/storage/xtradb/srv/srv0srv.c
+++ b/storage/xtradb/srv/srv0srv.c
@@ -86,6 +86,11 @@ Created 10/8/1995 Heikki Tuuri
#include "trx0i_s.h"
#include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
+/* prototypes of new functions added to ha_innodb.cc for kill_idle_transaction */
+ibool innobase_thd_is_idle(const void* thd);
+ib_int64_t innobase_thd_get_start_time(const void* thd);
+void innobase_thd_kill(void* thd);
+
/* prototypes for new functions added to ha_innodb.cc */
ibool innobase_get_slow_log();
@@ -100,6 +105,9 @@ UNIV_INTERN ulint srv_activity_count = 0;
/* The following is the maximum allowed duration of a lock wait. */
UNIV_INTERN ulint srv_fatal_semaphore_wait_threshold = 600;
+/**/
+UNIV_INTERN long long srv_kill_idle_transaction = 0;
+
/* How much data manipulation language (DML) statements need to be delayed,
in microseconds, in order to reduce the lagging of the purge thread. */
UNIV_INTERN ulint srv_dml_needed_delay = 0;
@@ -225,17 +233,15 @@ UNIV_INTERN ulint srv_buf_pool_curr_size = 0;
UNIV_INTERN ulint srv_mem_pool_size = ULINT_MAX;
UNIV_INTERN ulint srv_lock_table_size = ULINT_MAX;
-/* key value for shm */
-UNIV_INTERN uint srv_buffer_pool_shm_key = 0;
-UNIV_INTERN ibool srv_buffer_pool_shm_is_reused = FALSE;
-UNIV_INTERN ibool srv_buffer_pool_shm_checksum = TRUE;
-
/* This parameter is deprecated. Use srv_n_io_[read|write]_threads
instead. */
UNIV_INTERN ulint srv_n_file_io_threads = ULINT_MAX;
UNIV_INTERN ulint srv_n_read_io_threads = ULINT_MAX;
UNIV_INTERN ulint srv_n_write_io_threads = ULINT_MAX;
+/* Switch to enable random read ahead. */
+UNIV_INTERN my_bool srv_random_read_ahead = FALSE;
+
/* The universal page size of the database */
UNIV_INTERN ulint srv_page_size_shift = 0;
UNIV_INTERN ulint srv_page_size = 0;
@@ -334,6 +340,9 @@ UNIV_INTERN ulint srv_buf_pool_reads = 0;
/** Time in seconds between automatic buffer pool dumps */
UNIV_INTERN uint srv_auto_lru_dump = 0;
+/** Whether startup should be blocked until buffer pool is fully restored */
+UNIV_INTERN ibool srv_blocking_lru_restore;
+
/* structure to pass status variables to MySQL */
UNIV_INTERN export_struc export_vars;
@@ -2167,6 +2176,8 @@ srv_export_innodb_status(void)
export_vars.innodb_buffer_pool_wait_free = srv_buf_pool_wait_free;
export_vars.innodb_buffer_pool_pages_flushed = srv_buf_pool_flushed;
export_vars.innodb_buffer_pool_reads = srv_buf_pool_reads;
+ export_vars.innodb_buffer_pool_read_ahead_rnd
+ = buf_pool->stat.n_ra_pages_read_rnd;
export_vars.innodb_buffer_pool_read_ahead
= buf_pool->stat.n_ra_pages_read;
export_vars.innodb_buffer_pool_read_ahead_evicted
@@ -2499,6 +2510,12 @@ srv_error_monitor_thread(
ulint fatal_cnt = 0;
ib_uint64_t old_lsn;
ib_uint64_t new_lsn;
+ /* longest waiting thread for a semaphore */
+ os_thread_id_t waiter = os_thread_get_curr_id();
+ os_thread_id_t old_waiter = waiter;
+ /* the semaphore that is being waited for */
+ const void* sema = NULL;
+ const void* old_sema = NULL;
old_lsn = srv_start_lsn;
@@ -2547,7 +2564,8 @@ loop:
sync_arr_wake_threads_if_sema_free();
- if (sync_array_print_long_waits()) {
+ if (sync_array_print_long_waits(&waiter, &sema)
+ && sema == old_sema && os_thread_eq(waiter, old_waiter)) {
fatal_cnt++;
if (fatal_cnt > 10) {
@@ -2562,6 +2580,38 @@ loop:
}
} else {
fatal_cnt = 0;
+ old_waiter = waiter;
+ old_sema = sema;
+ }
+
+ if (srv_kill_idle_transaction && trx_sys) {
+ trx_t* trx;
+ time_t now;
+rescan_idle:
+ now = time(NULL);
+ mutex_enter(&kernel_mutex);
+ trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
+ while (trx) {
+ if (trx->conc_state == TRX_ACTIVE
+ && trx->mysql_thd
+ && innobase_thd_is_idle(trx->mysql_thd)) {
+ ib_int64_t start_time; /* as stmt ID */
+
+ start_time = innobase_thd_get_start_time(trx->mysql_thd);
+ if (trx->last_stmt_start != start_time) {
+ trx->idle_start = now;
+ trx->last_stmt_start = start_time;
+ } else if (difftime(now, trx->idle_start)
+ > srv_kill_idle_transaction) {
+ /* kill the session */
+ mutex_exit(&kernel_mutex);
+ innobase_thd_kill(trx->mysql_thd);
+ goto rescan_idle;
+ }
+ }
+ trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
+ }
+ mutex_exit(&kernel_mutex);
}
/* Flush stderr so that a database user gets the output
@@ -2607,7 +2657,9 @@ srv_LRU_dump_restore_thread(
os_thread_pf(os_thread_get_curr_id()));
#endif
- if (srv_auto_lru_dump)
+ /* If srv_blocking_lru_restore is TRUE, restore will be done
+ synchronously on startup. */
+ if (srv_auto_lru_dump && !srv_blocking_lru_restore)
buf_LRU_file_restore();
last_dump_time = time(NULL);
diff --git a/storage/xtradb/srv/srv0start.c b/storage/xtradb/srv/srv0start.c
index d002a1bb682..2bb122ea91c 100644
--- a/storage/xtradb/srv/srv0start.c
+++ b/storage/xtradb/srv/srv0start.c
@@ -88,6 +88,7 @@ Created 2/16/1996 Heikki Tuuri
# include "thr0loc.h"
# include "os0sync.h" /* for INNODB_RW_LOCKS_USE_ATOMICS */
# include "zlib.h" /* for ZLIB_VERSION */
+# include "buf0lru.h" /* for buf_LRU_file_restore() */
/** Log sequence number immediately after startup */
UNIV_INTERN ib_uint64_t srv_start_lsn;
@@ -126,9 +127,9 @@ static mutex_t ios_mutex;
static ulint ios;
/** io_handler_thread parameters for thread identification */
-static ulint n[SRV_MAX_N_IO_THREADS + 7 + 64];
+static ulint n[SRV_MAX_N_IO_THREADS + 7 + UNIV_MAX_PARALLELISM];
/** io_handler_thread identifiers */
-static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 7 + 64];
+static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 7 + UNIV_MAX_PARALLELISM];
/** We use this mutex to test the return value of pthread_mutex_trylock
on successful locking. HP-UX does NOT return 0, though Linux et al do. */
@@ -1200,6 +1201,12 @@ innobase_start_or_create_for_mysql(void)
);
#endif
+#ifdef UNIV_BLOB_DEBUG
+ fprintf(stderr,
+ "InnoDB: !!!!!!!! UNIV_BLOB_DEBUG switched on !!!!!!!!!\n"
+ "InnoDB: Server restart may fail with UNIV_BLOB_DEBUG\n");
+#endif /* UNIV_BLOB_DEBUG */
+
#ifdef UNIV_SYNC_DEBUG
fprintf(stderr,
"InnoDB: !!!!!!!! UNIV_SYNC_DEBUG switched on !!!!!!!!!\n");
@@ -1741,8 +1748,6 @@ innobase_start_or_create_for_mysql(void)
Note that this is not as heavy weight as it seems. At
this point there will be only ONE page in the buf_LRU
and there must be no page in the buf_flush list. */
- /* buffer_pool_shm should not be reused when recovery was needed. */
- if (!srv_buffer_pool_shm_is_reused)
buf_pool_invalidate();
/* We always try to do a recovery, even if the database had
@@ -1872,6 +1877,11 @@ innobase_start_or_create_for_mysql(void)
os_thread_create(&srv_LRU_dump_restore_thread, NULL,
thread_ids + 5 + SRV_MAX_N_IO_THREADS);
+ /* If srv_blocking_lru_restore is TRUE, load buffer pool contents
+ synchronously */
+ if (srv_auto_lru_dump && srv_blocking_lru_restore)
+ buf_LRU_file_restore();
+
srv_is_being_started = FALSE;
if (trx_doublewrite == NULL) {
diff --git a/storage/xtradb/sync/sync0arr.c b/storage/xtradb/sync/sync0arr.c
index 57a288089c7..4e788b4a968 100644
--- a/storage/xtradb/sync/sync0arr.c
+++ b/storage/xtradb/sync/sync0arr.c
@@ -913,8 +913,10 @@ Prints warnings of long semaphore waits to stderr.
@return TRUE if fatal semaphore wait threshold was exceeded */
UNIV_INTERN
ibool
-sync_array_print_long_waits(void)
-/*=============================*/
+sync_array_print_long_waits(
+/*========================*/
+ os_thread_id_t* waiter, /*!< out: longest waiting thread */
+ const void** sema) /*!< out: longest-waited-for semaphore */
{
sync_cell_t* cell;
ibool old_val;
@@ -922,24 +924,40 @@ sync_array_print_long_waits(void)
ulint i;
ulint fatal_timeout = srv_fatal_semaphore_wait_threshold;
ibool fatal = FALSE;
+ double longest_diff = 0;
for (i = 0; i < sync_primary_wait_array->n_cells; i++) {
+ double diff;
+ void* wait_object;
+
cell = sync_array_get_nth_cell(sync_primary_wait_array, i);
- if (cell->wait_object != NULL && cell->waiting
- && difftime(time(NULL), cell->reservation_time) > 240) {
+ wait_object = cell->wait_object;
+
+ if (wait_object == NULL || !cell->waiting) {
+
+ continue;
+ }
+
+ diff = difftime(time(NULL), cell->reservation_time);
+
+ if (diff > 240) {
fputs("InnoDB: Warning: a long semaphore wait:\n",
stderr);
sync_array_cell_print(stderr, cell);
noticed = TRUE;
}
- if (cell->wait_object != NULL && cell->waiting
- && difftime(time(NULL), cell->reservation_time)
- > fatal_timeout) {
+ if (diff > fatal_timeout) {
fatal = TRUE;
}
+
+ if (diff > longest_diff) {
+ longest_diff = diff;
+ *sema = wait_object;
+ *waiter = cell->thread;
+ }
}
if (noticed) {
diff --git a/storage/xtradb/sync/sync0rw.c b/storage/xtradb/sync/sync0rw.c
index 9431de15fda..fe000e7d008 100644
--- a/storage/xtradb/sync/sync0rw.c
+++ b/storage/xtradb/sync/sync0rw.c
@@ -261,6 +261,9 @@ rw_lock_create_func(
contains garbage at initialization and cannot be used for
recursive x-locking. */
lock->recursive = FALSE;
+ /* Silence Valgrind when UNIV_DEBUG_VALGRIND is not enabled. */
+ memset((void*) &lock->writer_thread, 0, sizeof lock->writer_thread);
+ UNIV_MEM_INVALID(&lock->writer_thread, sizeof lock->writer_thread);
#ifdef UNIV_SYNC_DEBUG
UT_LIST_INIT(lock->debug_list);
@@ -762,7 +765,9 @@ rw_lock_add_debug_info(
rw_lock_debug_mutex_exit();
if ((pass == 0) && (lock_type != RW_LOCK_WAIT_EX)) {
- sync_thread_add_level(lock, lock->level);
+ sync_thread_add_level(lock, lock->level,
+ lock_type == RW_LOCK_EX
+ && lock->lock_word < 0);
}
}
diff --git a/storage/xtradb/sync/sync0sync.c b/storage/xtradb/sync/sync0sync.c
index 3a80da9318b..277a53e4fb2 100644
--- a/storage/xtradb/sync/sync0sync.c
+++ b/storage/xtradb/sync/sync0sync.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -668,7 +668,7 @@ mutex_set_debug_info(
ut_ad(mutex);
ut_ad(file_name);
- sync_thread_add_level(mutex, mutex->level);
+ sync_thread_add_level(mutex, mutex->level, FALSE);
mutex->file_name = file_name;
mutex->line = line;
@@ -1094,8 +1094,9 @@ void
sync_thread_add_level(
/*==================*/
void* latch, /*!< in: pointer to a mutex or an rw-lock */
- ulint level) /*!< in: level in the latching order; if
+ ulint level, /*!< in: level in the latching order; if
SYNC_LEVEL_VARYING, nothing is done */
+ ibool relock) /*!< in: TRUE if re-entering an x-lock */
{
sync_level_t* array;
sync_level_t* slot;
@@ -1143,6 +1144,10 @@ sync_thread_add_level(
array = thread_slot->levels;
+ if (relock) {
+ goto levels_ok;
+ }
+
/* NOTE that there is a problem with _NODE and _LEAF levels: if the
B-tree height changes, then a leaf can change to an internal node
or the other way around. We do not know at present if this can cause
@@ -1287,6 +1292,7 @@ sync_thread_add_level(
ut_error;
}
+levels_ok:
for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) {
slot = sync_thread_levels_get_nth(array, i);
diff --git a/storage/xtradb/trx/trx0i_s.c b/storage/xtradb/trx/trx0i_s.c
index e148234888b..5cc9df2d5c4 100644
--- a/storage/xtradb/trx/trx0i_s.c
+++ b/storage/xtradb/trx/trx0i_s.c
@@ -504,7 +504,7 @@ fill_trx_row(
query[stmt_len] = '\0';
row->trx_query = ha_storage_put_memlim(
- cache->storage, stmt, stmt_len + 1,
+ cache->storage, query, stmt_len + 1,
MAX_ALLOWED_FOR_STORAGE(cache));
row->trx_query_cs = innobase_get_charset(trx->mysql_thd);
diff --git a/storage/xtradb/trx/trx0rec.c b/storage/xtradb/trx/trx0rec.c
index 71629f01d73..a7a393d31c8 100644
--- a/storage/xtradb/trx/trx0rec.c
+++ b/storage/xtradb/trx/trx0rec.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -1590,6 +1590,10 @@ trx_undo_prev_version_build(
return(DB_ERROR);
}
+# if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(rec, offsets));
+# endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
if (row_upd_changes_field_size_or_external(index, offsets, update)) {
ulint n_ext;
diff --git a/storage/xtradb/trx/trx0sys.c b/storage/xtradb/trx/trx0sys.c
index a9fb5bb38d8..b5bec64ee37 100644
--- a/storage/xtradb/trx/trx0sys.c
+++ b/storage/xtradb/trx/trx0sys.c
@@ -37,6 +37,7 @@ Created 3/26/1996 Heikki Tuuri
#include "trx0rseg.h"
#include "trx0undo.h"
#include "srv0srv.h"
+#include "srv0start.h"
#include "trx0purge.h"
#include "log0log.h"
#include "log0recv.h"
@@ -1872,10 +1873,12 @@ void
trx_sys_close(void)
/*===============*/
{
+ trx_t* trx;
trx_rseg_t* rseg;
read_view_t* view;
ut_ad(trx_sys != NULL);
+ ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
/* Check that all read views are closed except read view owned
by a purge. */
@@ -1907,6 +1910,13 @@ trx_sys_close(void)
mem_free(trx_doublewrite);
trx_doublewrite = NULL;
+ /* Only prepared transactions may be left in the system. Free them. */
+ ut_a(UT_LIST_GET_LEN(trx_sys->trx_list) == trx_n_prepared);
+
+ while ((trx = UT_LIST_GET_FIRST(trx_sys->trx_list)) != NULL) {
+ trx_free_prepared(trx);
+ }
+
/* There can't be any active transactions. */
rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
diff --git a/storage/xtradb/trx/trx0trx.c b/storage/xtradb/trx/trx0trx.c
index 7ea3e09036f..1fca58d278f 100644
--- a/storage/xtradb/trx/trx0trx.c
+++ b/storage/xtradb/trx/trx0trx.c
@@ -50,6 +50,9 @@ UNIV_INTERN sess_t* trx_dummy_sess = NULL;
/** Number of transactions currently allocated for MySQL: protected by
the kernel mutex */
UNIV_INTERN ulint trx_n_mysql_transactions = 0;
+/* Number of transactions currently in the XA PREPARED state: protected by
+the kernel mutex */
+UNIV_INTERN ulint trx_n_prepared = 0;
/*************************************************************//**
Set detailed error message for the transaction. */
@@ -111,6 +114,8 @@ trx_create(
trx->flush_log_at_trx_commit_session = 3; /* means to use innodb_flush_log_at_trx_commit value */
+ trx->fake_changes = FALSE;
+
trx->check_foreigns = TRUE;
trx->check_unique_secondary = TRUE;
@@ -134,6 +139,9 @@ trx_create(
trx->mysql_relay_log_file_name = "";
trx->mysql_relay_log_pos = 0;
+ trx->idle_start = 0;
+ trx->last_stmt_start = 0;
+
mutex_create(&trx->undo_mutex, SYNC_TRX_UNDO);
trx->rseg = NULL;
@@ -354,6 +362,60 @@ trx_free(
}
/********************************************************************//**
+At shutdown, frees a transaction object that is in the PREPARED state. */
+UNIV_INTERN
+void
+trx_free_prepared(
+/*==============*/
+ trx_t* trx) /*!< in, own: trx object */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_a(trx->conc_state == TRX_PREPARED);
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+
+ /* Prepared transactions are sort of active; they allow
+ ROLLBACK and COMMIT operations. Because the system does not
+ contain any other transactions than prepared transactions at
+ the shutdown stage and because a transaction cannot become
+ PREPARED while holding locks, it is safe to release the locks
+ held by PREPARED transactions here at shutdown.*/
+ lock_release_off_kernel(trx);
+
+ trx_undo_free_prepared(trx);
+
+ mutex_free(&trx->undo_mutex);
+
+ if (trx->undo_no_arr) {
+ trx_undo_arr_free(trx->undo_no_arr);
+ }
+
+ ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
+ ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);
+
+ ut_a(trx->wait_lock == NULL);
+ ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
+
+ ut_a(!trx->has_search_latch);
+
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ if (trx->lock_heap) {
+ mem_heap_free(trx->lock_heap);
+ }
+
+ if (trx->global_read_view_heap) {
+ mem_heap_free(trx->global_read_view_heap);
+ }
+
+ ut_a(ib_vector_is_empty(trx->autoinc_locks));
+ ib_vector_free(trx->autoinc_locks);
+
+ UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
+
+ mem_free(trx);
+}
+
+/********************************************************************//**
Frees a transaction object for MySQL. */
UNIV_INTERN
void
@@ -495,6 +557,7 @@ trx_lists_init_at_db_start(void)
if (srv_force_recovery == 0) {
trx->conc_state = TRX_PREPARED;
+ trx_n_prepared++;
} else {
fprintf(stderr,
"InnoDB: Since"
@@ -573,6 +636,7 @@ trx_lists_init_at_db_start(void)
trx->conc_state
= TRX_PREPARED;
+ trx_n_prepared++;
} else {
fprintf(stderr,
"InnoDB: Since"
@@ -878,6 +942,11 @@ trx_commit_off_kernel(
|| trx->conc_state == TRX_PREPARED);
ut_ad(mutex_own(&kernel_mutex));
+ if (UNIV_UNLIKELY(trx->conc_state == TRX_PREPARED)) {
+ ut_a(trx_n_prepared > 0);
+ trx_n_prepared--;
+ }
+
/* The following assignment makes the transaction committed in memory
and makes its changes to data visible to other transactions.
NOTE that there is a small discrepancy from the strict formal
@@ -1945,6 +2014,7 @@ trx_prepare_off_kernel(
/*--------------------------------------*/
trx->conc_state = TRX_PREPARED;
+ trx_n_prepared++;
/*--------------------------------------*/
if (lsn) {
@@ -2127,10 +2197,11 @@ trx_get_trx_by_xid(
while (trx) {
/* Compare two X/Open XA transaction id's: their
length should be the same and binary comparison
- of gtrid_lenght+bqual_length bytes should be
+ of gtrid_length+bqual_length bytes should be
the same */
- if (trx->conc_state == TRX_PREPARED
+ if (trx->is_recovered
+ && trx->conc_state == TRX_PREPARED
&& xid->gtrid_length == trx->xid.gtrid_length
&& xid->bqual_length == trx->xid.bqual_length
&& memcmp(xid->data, trx->xid.data,
diff --git a/storage/xtradb/trx/trx0undo.c b/storage/xtradb/trx/trx0undo.c
index 9ed83b5d5c1..ec1cd2d2c43 100644
--- a/storage/xtradb/trx/trx0undo.c
+++ b/storage/xtradb/trx/trx0undo.c
@@ -36,6 +36,7 @@ Created 3/26/1996 Heikki Tuuri
#include "trx0rseg.h"
#include "trx0trx.h"
#include "srv0srv.h"
+#include "srv0start.h"
#include "trx0rec.h"
#include "trx0purge.h"
@@ -2014,4 +2015,28 @@ trx_undo_insert_cleanup(
mutex_exit(&(rseg->mutex));
}
+
+/********************************************************************//**
+At shutdown, frees the undo logs of a PREPARED transaction. */
+UNIV_INTERN
+void
+trx_undo_free_prepared(
+/*===================*/
+ trx_t* trx) /*!< in/out: PREPARED transaction */
+{
+ ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
+
+ if (trx->update_undo) {
+ ut_a(trx->update_undo->state == TRX_UNDO_PREPARED);
+ UT_LIST_REMOVE(undo_list, trx->rseg->update_undo_list,
+ trx->update_undo);
+ trx_undo_mem_free(trx->update_undo);
+ }
+ if (trx->insert_undo) {
+ ut_a(trx->insert_undo->state == TRX_UNDO_PREPARED);
+ UT_LIST_REMOVE(undo_list, trx->rseg->insert_undo_list,
+ trx->insert_undo);
+ trx_undo_mem_free(trx->insert_undo);
+ }
+}
#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/ut/ut0mem.c b/storage/xtradb/ut/ut0mem.c
index bf55e4273b6..95fb2187b79 100644
--- a/storage/xtradb/ut/ut0mem.c
+++ b/storage/xtradb/ut/ut0mem.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -489,53 +489,6 @@ ut_strlcpy_rev(
return(src_size);
}
-/**********************************************************************//**
-Make a quoted copy of a NUL-terminated string. Leading and trailing
-quotes will not be included; only embedded quotes will be escaped.
-See also ut_strlenq() and ut_memcpyq().
-@return pointer to end of dest */
-UNIV_INTERN
-char*
-ut_strcpyq(
-/*=======*/
- char* dest, /*!< in: output buffer */
- char q, /*!< in: the quote character */
- const char* src) /*!< in: null-terminated string */
-{
- while (*src) {
- if ((*dest++ = *src++) == q) {
- *dest++ = q;
- }
- }
-
- return(dest);
-}
-
-/**********************************************************************//**
-Make a quoted copy of a fixed-length string. Leading and trailing
-quotes will not be included; only embedded quotes will be escaped.
-See also ut_strlenq() and ut_strcpyq().
-@return pointer to end of dest */
-UNIV_INTERN
-char*
-ut_memcpyq(
-/*=======*/
- char* dest, /*!< in: output buffer */
- char q, /*!< in: the quote character */
- const char* src, /*!< in: string to be quoted */
- ulint len) /*!< in: length of src */
-{
- const char* srcend = src + len;
-
- while (src < srcend) {
- if ((*dest++ = *src++) == q) {
- *dest++ = q;
- }
- }
-
- return(dest);
-}
-
#ifndef UNIV_HOTBACKUP
/**********************************************************************//**
Return the number of times s2 occurs in s1. Overlapping instances of s2