summaryrefslogtreecommitdiff
path: root/storage/xtradb
diff options
context:
space:
mode:
authorunknown <knielsen@knielsen-hq.org>2010-08-04 10:39:53 +0200
committerunknown <knielsen@knielsen-hq.org>2010-08-04 10:39:53 +0200
commit3f1c763a94cd25c4810692983f6e61b528e21268 (patch)
treed8f9632933303f9bdf8cdc875613ca99f51ecbcb /storage/xtradb
parenta74d04671d23fa65bd610184965cc2974ec41cf9 (diff)
parent0c6afe17dceb926b5c757157308188e540caec1e (diff)
downloadmariadb-git-3f1c763a94cd25c4810692983f6e61b528e21268.tar.gz
Merge XtraDB from Percona-Server-5.1.47-11 into MariaDB.
Diffstat (limited to 'storage/xtradb')
-rw-r--r--storage/xtradb/CMakeLists.txt22
-rw-r--r--storage/xtradb/ChangeLog271
-rw-r--r--storage/xtradb/Makefile.am2
-rw-r--r--storage/xtradb/btr/btr0btr.c139
-rw-r--r--storage/xtradb/btr/btr0cur.c56
-rw-r--r--storage/xtradb/btr/btr0pcur.c27
-rw-r--r--storage/xtradb/buf/buf0buddy.c9
-rw-r--r--storage/xtradb/buf/buf0buf.c101
-rw-r--r--storage/xtradb/buf/buf0flu.c284
-rw-r--r--storage/xtradb/buf/buf0lru.c141
-rw-r--r--storage/xtradb/buf/buf0rea.c54
-rw-r--r--storage/xtradb/data/data0data.c15
-rw-r--r--storage/xtradb/dict/dict0boot.c10
-rw-r--r--storage/xtradb/dict/dict0crea.c128
-rw-r--r--storage/xtradb/dict/dict0dict.c89
-rw-r--r--storage/xtradb/dict/dict0load.c119
-rw-r--r--storage/xtradb/dict/dict0mem.c2
-rw-r--r--storage/xtradb/fil/fil0fil.c90
-rw-r--r--storage/xtradb/fsp/fsp0fsp.c46
-rw-r--r--storage/xtradb/ha/ha0ha.c16
-rw-r--r--storage/xtradb/ha/hash0hash.c6
-rw-r--r--storage/xtradb/handler/ha_innodb.cc1144
-rw-r--r--storage/xtradb/handler/ha_innodb.h37
-rw-r--r--storage/xtradb/handler/handler0alter.cc97
-rw-r--r--storage/xtradb/handler/i_s.cc77
-rw-r--r--storage/xtradb/handler/innodb_patch_info.h2
-rw-r--r--storage/xtradb/ibuf/ibuf0ibuf.c37
-rw-r--r--storage/xtradb/include/btr0btr.h14
-rw-r--r--storage/xtradb/include/btr0btr.ic6
-rw-r--r--storage/xtradb/include/btr0cur.h20
-rw-r--r--storage/xtradb/include/btr0pcur.h52
-rw-r--r--storage/xtradb/include/btr0pcur.ic49
-rw-r--r--storage/xtradb/include/buf0buf.h38
-rw-r--r--storage/xtradb/include/buf0buf.ic15
-rw-r--r--storage/xtradb/include/buf0flu.h32
-rw-r--r--storage/xtradb/include/data0type.ic6
-rw-r--r--storage/xtradb/include/dict0boot.h3
-rw-r--r--storage/xtradb/include/dict0dict.h21
-rw-r--r--storage/xtradb/include/dict0mem.h24
-rw-r--r--storage/xtradb/include/fil0fil.h9
-rw-r--r--storage/xtradb/include/hash0hash.h5
-rw-r--r--storage/xtradb/include/hash0hash.ic20
-rw-r--r--storage/xtradb/include/lock0lock.h12
-rw-r--r--storage/xtradb/include/log0log.h31
-rw-r--r--storage/xtradb/include/log0log.ic11
-rw-r--r--storage/xtradb/include/log0recv.h14
-rw-r--r--storage/xtradb/include/mem0dbg.h9
-rw-r--r--storage/xtradb/include/mem0dbg.ic5
-rw-r--r--storage/xtradb/include/mem0mem.h5
-rw-r--r--storage/xtradb/include/mem0mem.ic10
-rw-r--r--storage/xtradb/include/mtr0log.ic5
-rw-r--r--storage/xtradb/include/mtr0mtr.ic5
-rw-r--r--storage/xtradb/include/os0file.h19
-rw-r--r--storage/xtradb/include/que0que.h13
-rw-r--r--storage/xtradb/include/que0que.ic16
-rw-r--r--storage/xtradb/include/row0mysql.h23
-rw-r--r--storage/xtradb/include/row0sel.h13
-rw-r--r--storage/xtradb/include/srv0srv.h65
-rw-r--r--storage/xtradb/include/sync0rw.h5
-rw-r--r--storage/xtradb/include/sync0sync.h26
-rw-r--r--storage/xtradb/include/trx0rseg.h2
-rw-r--r--storage/xtradb/include/trx0sys.h29
-rw-r--r--storage/xtradb/include/trx0sys.ic34
-rw-r--r--storage/xtradb/include/trx0trx.h81
-rw-r--r--storage/xtradb/include/trx0types.h9
-rw-r--r--storage/xtradb/include/univ.i20
-rw-r--r--storage/xtradb/include/ut0rbt.h309
-rw-r--r--storage/xtradb/include/ut0rnd.ic1
-rw-r--r--storage/xtradb/lock/lock0lock.c267
-rw-r--r--storage/xtradb/log/log0log.c23
-rw-r--r--storage/xtradb/log/log0recv.c85
-rw-r--r--storage/xtradb/mem/mem0dbg.c6
-rw-r--r--storage/xtradb/mem/mem0mem.c20
-rw-r--r--storage/xtradb/os/os0file.c32
-rw-r--r--storage/xtradb/page/page0page.c50
-rw-r--r--storage/xtradb/page/page0zip.c5
-rw-r--r--storage/xtradb/plug.in2
-rw-r--r--storage/xtradb/rem/rem0rec.c86
-rw-r--r--storage/xtradb/row/row0ins.c10
-rw-r--r--storage/xtradb/row/row0merge.c157
-rw-r--r--storage/xtradb/row/row0mysql.c285
-rw-r--r--storage/xtradb/row/row0row.c17
-rw-r--r--storage/xtradb/row/row0sel.c81
-rw-r--r--storage/xtradb/row/row0umod.c33
-rw-r--r--storage/xtradb/row/row0upd.c3
-rw-r--r--storage/xtradb/srv/srv0srv.c248
-rw-r--r--storage/xtradb/srv/srv0start.c250
-rw-r--r--storage/xtradb/sync/sync0sync.c87
-rw-r--r--storage/xtradb/trx/trx0i_s.c13
-rw-r--r--storage/xtradb/trx/trx0rec.c24
-rw-r--r--storage/xtradb/trx/trx0rseg.c2
-rw-r--r--storage/xtradb/trx/trx0sys.c270
-rw-r--r--storage/xtradb/trx/trx0trx.c5
-rw-r--r--storage/xtradb/ut/ut0rbt.c1249
94 files changed, 5781 insertions, 1636 deletions
diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt
index 8bdc029329d..759bb525bdf 100644
--- a/storage/xtradb/CMakeLists.txt
+++ b/storage/xtradb/CMakeLists.txt
@@ -80,21 +80,11 @@ SET(XTRADB_SOURCES btr/btr0btr.c btr/btr0cur.c btr/btr0pcur.c btr/btr0sea.c
trx/trx0i_s.c trx/trx0purge.c trx/trx0rec.c trx/trx0roll.c trx/trx0rseg.c
trx/trx0sys.c trx/trx0trx.c trx/trx0undo.c
usr/usr0sess.c
- ut/ut0byte.c ut/ut0dbg.c ut/ut0mem.c ut/ut0rnd.c ut/ut0ut.c ut/ut0vec.c
+ ut/ut0byte.c ut/ut0dbg.c ut/ut0mem.c ut/ut0rbt.c ut/ut0rnd.c ut/ut0ut.c ut/ut0vec.c
ut/ut0list.c ut/ut0wqueue.c)
-ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS -DIB_HAVE_PAUSE_INSTRUCTION)
+# Windows atomics do not perform well. Disable Windows atomics by default.
+# See bug#52102 for details.
+#ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS -DINNODB_RW_LOCKS_USE_ATOMICS -DHAVE_IB_PAUSE_INSTRUCTION)
+ADD_DEFINITIONS(-DHAVE_IB_PAUSE_INSTRUCTION)
-IF (MYSQL_VERSION_ID GREATER "50137")
- MYSQL_STORAGE_ENGINE(XTRADB)
- GET_TARGET_PROPERTY(LIB_LOCATION ha_xtradb LOCATION)
- IF(LIB_LOCATION)
- SET_TARGET_PROPERTIES(ha_xtradb PROPERTIES OUTPUT_NAME ha_xtradb)
- ENDIF(LIB_LOCATION)
-ELSE (MYSQL_VERSION_ID GREATER "50137")
- IF (NOT SOURCE_SUBLIBS)
- ADD_DEFINITIONS(-D_WIN32 -DMYSQL_SERVER)
- ADD_LIBRARY(xtradb STATIC ${XTRADB_SOURCES})
- # Require mysqld_error.h, which is built as part of the GenError
- ADD_DEPENDENCIES(xtradb GenError)
- ENDIF (NOT SOURCE_SUBLIBS)
-ENDIF (MYSQL_VERSION_ID GREATER "50137")
+MYSQL_STORAGE_ENGINE(XTRADB)
diff --git a/storage/xtradb/ChangeLog b/storage/xtradb/ChangeLog
index 1a6e07fd147..bc69aaca96a 100644
--- a/storage/xtradb/ChangeLog
+++ b/storage/xtradb/ChangeLog
@@ -1,3 +1,262 @@
+2010-05-03 The InnoDB Team
+
+ * buf0buf.c:
+ Fix Bug#53248 compressed tables page checksum mismatch after
+ re-enabling innodb_checksums
+
+2010-04-28 The InnoDB Team
+
+ * log/log0recv.h, log/log0recv.c:
+ Fix Bug#53122 InnoDB recovery uses too big a hash table for redo
+ log records
+
+2010-04-27 The InnoDB Team
+
+ * handler/ha_innodb.cc, lock/lock0lock.c, row/row0mysql.c,
+ row/row0sel.c:
+ Fix Bug#48607 READ UNCOMMITTED uses more locks than READ COMMITTED
+ in InnoDB 5.1+
+
+2010-04-26 The InnoDB Team
+
+ * row/row0sel.c:
+ Fix Bug#52663 Lost update incrementing column value under
+ READ COMMITTED isolation level
+
+2010-04-22 The InnoDB Team
+
+ * include/dict0boot.h, dict/dict0boot.c:
+ Fix a bug that prevented the crash recovery of fast CREATE INDEX
+ from dropping partially created indexes.
+
+2010-04-21 The InnoDB Team
+
+ * btr/btr0btr.c:
+ Fix Bug#52964 Infinite loop in btr_page_split_and_insert()
+ in ROW_FORMAT=COMPRESSED
+
+2010-04-21 The InnoDB Team
+
+ * data/data0data.c:
+ Fix Bug#52745 Failing assertion: blob_no < page_zip->n_blobs
+
+2010-04-20 The InnoDB Team
+
+ * dict/dict0crea.c, handler/ha_innodb.cc, include/trx0trx.h:
+ Fix Bug#50495 'Row size too large' for plugin, but works for
+ built-in InnoDB
+ Only check the record size at index creation time when
+ innodb_strict_mode is set or when ROW_FORMAT is DYNAMIC or COMPRESSED.
+
+2010-04-20 The InnoDB Team
+
+ * btr/btr0btr.c, include/univ.i:
+ Implement UNIV_BTR_AVOID_COPY, for avoiding writes when a B-tree
+ node is split at the first or last record.
+
+2010-04-15 The InnoDB Team
+
+ * trx/trx0rec.c:
+ Fix Bug#52746 InnoDB purge thread crashed with table containing
+ prefix indexed blobs
+
+2010-03-31 The InnoDB Team
+
+ * mysql-test/innodb_bug51920.test, mysql-test/innodb_bug51920.result,
+ srv/srv0srv.c:
+ Fix Bug#51920 InnoDB connections in row lock wait ignore KILL
+ until lock wait timeout
+
+2010-03-31 The InnoDB Team
+
+ * mysql-test/innodb_bug38231.test:
+ Remove non-determinism in the test case.
+
+2010-03-18 The InnoDB Team
+
+ * CMakeLists.txt:
+ Fix Bug#52102 InnoDB Plugin shows performance drop compared to
+ InnoDB (Windows)
+
+2010-03-18 The InnoDB Team
+
+ * buf0buf.ic:
+ When comparing the time of the first access to a block against
+ innodb_old_blocks_time, use 32-bit arithmetics. The comparison was
+ incorrect on 64-bit systems.
+
+2010-03-11 The InnoDB Team
+
+ * buf0buf.h, buf0buf.ic:
+ Fix and clarify the latching of some buf_block_t members.
+ Note that check_index_page_at_flush is not protected by any mutex.
+ Note and assert that lock_hash_val is protected by the rw-latch.
+
+2010-03-10 The InnoDB Team
+
+ * trx/trx0sys.c:
+ Fix Bug#51653 outdated reference to set-variable
+
+2010-03-10 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb_bug21704.result,
+ mysql-test/innodb_bug47621.result, mysql-test/innodb_bug47621.test:
+ Fix Bug#47621 MySQL and InnoDB data dictionaries will become out of
+ sync when renaming columns
+
+2010-03-10 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Fix Bug#51356 Many Valgrind errors in error messages
+ with concurrent DDL
+
+2010-03-10 The InnoDB Team
+
+ * handler/ha_innodb.cc, handler/handler0alter.cc,
+ mysql-test/innodb_bug51378.result, mysql-test/innodb_bug51378.test:
+ Fix Bug#51378 Init 'ref_length' to correct value, in case an out
+ of bound MySQL primary_key
+
+2010-03-10 The InnoDB Team
+
+ * log/log0recv.c:
+ Remove a bogus assertion about page numbers exceeding 0x90000000
+ in the redo log. Abort when encountering a corrupted redo log
+ record, unless innodb_force_recovery is set.
+
+2010-03-09 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Make SHOW ENGINE INNODB MUTEX STATUS display SUM(os_waits)
+ for the buffer pool block mutexes and locks.
+
+2010-03-08 The InnoDB Team
+
+ * fil/fil0fil.c:
+ Fix ALTER TABLE ... IMPORT TABLESPACE of compressed tables.
+
+2010-03-03 The InnoDB Team
+
+ * handler/handler0alter.cc, innodb-index.result, innodb-index.test,
+ innodb.result, innodb.test:
+ Disallow a duplicate index name when creating an index.
+
+2010-02-11 The InnoDB Team
+
+ * include/mem0mem.h, include/mem0mem.ic, mem/mem0mem.c:
+ Fix Bug#49535 Available memory check slows down crash
+ recovery tens of times
+
+2010-02-09 The InnoDB Team
+
+ * buf/buf0buf.c:
+ Fix Bug#38901 InnoDB logs error repeatedly when trying to load
+ page into buffer pool
+
+2010-02-09 The InnoDB Team
+
+ * srv/srv0srv.c:
+ Let the master thread sleep if the amount of work to be done is
+ calibrated as taking less than a second.
+
+2010-02-04 The InnoDB Team
+
+ * btr/btr0btr.c, btr/btr0cur.c, btr/btr0pcur.c, buf/buf0buf.c,
+ include/btr0btr.h, include/btr0cur.h, include/btr0pcur.h,
+ include/btr0pcur.ic, include/buf0buf.h, row/row0ins.c, row/row0sel.c:
+ Pass the file name and line number of the caller of the
+ b-tree cursor functions to the buffer pool requests, in order
+ to make the latch diagnostics more accurate.
+
+2010-02-03 The InnoDB Team
+
+ * lock/lock0lock.c:
+ Fix Bug#49001 SHOW INNODB STATUS deadlock info incorrect
+ when deadlock detection aborts
+
+2010-02-03 The InnoDB Team
+
+ * buf/buf0lru.c:
+ Fix Bug#35077 Very slow DROP TABLE (ALTER TABLE, OPTIMIZE TABLE)
+ on compressed tables
+
+2010-02-03 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/row0mysql.h, row/row0mysql.c:
+ Clean up CHECK TABLE error handling.
+
+2010-02-01 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb-autoinc.test,
+ mysql-test/innodb-autoinc.result,
+ mysql-test/innodb-autoinc-44030.test,
+ mysql-test/innodb-autoinc-44030.result:
+ Fix Bug#49497 Error 1467 (ER_AUTOINC_READ_FAILED) on inserting
+ a negative value
+
+2010-01-27 The InnoDB Team
+
+ * include/row0mysql.h, log/log0recv.c, row/row0mysql.c:
+ Drop temporary tables at startup.
+ This addresses the third aspect of
+ Bug#41609 Crash recovery does not work for InnoDB temporary tables.
+
+2010-01-21 The InnoDB Team
+
+ * buf/buf0buf.c:
+ Do not merge buffered inserts to compressed pages before
+ the redo log has been applied in crash recovery.
+
+2010-01-13 The InnoDB Team
+
+ * row/row0sel.c:
+ On the READ UNCOMMITTED isolation level, do not attempt to access
+ a clustered index record that has been marked for deletion. The
+ built-in InnoDB in MySQL 5.1 and earlier would attempt to retrieve
+ a previous version of the record in this case.
+
+2010-01-13 The InnoDB Team
+
+ * buf/buf0buf.c:
+ When disabling the adaptive hash index, check the block state
+ before checking block->is_hashed, because the latter may be
+ uninitialized right after server startup.
+
+2010-01-12 The InnoDB Team
+
+ * handler/ha_innodb.cc, handler/ha_innodb.h:
+ Fix Bug#46193 crash when accessing tables after enabling
+ innodb_force_recovery option
+
+2010-01-12 The InnoDB Team
+
+ * row/row0mysql.c:
+ Fix Bug#49238 Creating/Dropping a temporary table while at 1023
+ transactions will cause assert.
+
+2009-12-02 The InnoDB Team
+
+ * srv/srv0start.c:
+ Display the zlib version number at startup.
+ InnoDB compressed tables use zlib, and the implementation depends
+ on the zlib function compressBound(), whose definition was slightly
+ changed in zlib version 1.2.3.1 in 2006. MySQL bundles zlib 1.2.3
+ from 2005, but some installations use a more recent zlib.
+
+2009-11-30 The InnoDB Team
+
+ * dict/dict0crea.c, dict/dict0mem.c, dict/dict0load.c,
+ dict/dict0boot.c, fil/fil0fil.c, handler/ha_innodb.cc,
+ include/dict0mem.h, row/row0mysql.c:
+ Fix the bogus warning messages for non-existing temporary
+ tables that were reported in
+ Bug#41609 Crash recovery does not work for InnoDB temporary tables.
+ The actual crash recovery bug was corrected on 2009-04-29.
+
+2009-11-27 The InnoDB Team
+
+ InnoDB Plugin 1.0.6 released
+
2009-11-20 The InnoDB Team
* handler/ha_innodb.cc:
@@ -79,8 +338,8 @@
sync/sync0arr.c, sync/sync0sync.c, thr/thr0loc.c, trx/trx0i_s.c,
trx/trx0purge.c, trx/trx0rseg.c, trx/trx0sys.c, trx/trx0undo.c,
usr/usr0sess.c, ut/ut0mem.c:
- Fix Bug #45992 innodb memory not freed after shutdown
- Fix Bug #46656 InnoDB plugin: memory leaks (Valgrind)
+ Fix Bug#45992 innodb memory not freed after shutdown
+ Fix Bug#46656 InnoDB plugin: memory leaks (Valgrind)
2009-10-29 The InnoDB Team
@@ -422,7 +681,7 @@
* dict/dict0dict.c:
When an index column cannot be found in the table during index
creation, display additional diagnostic before an assertion failure.
- This does NOT fix Bug #44571 InnoDB Plugin crashes on ADD INDEX,
+ This does NOT fix Bug#44571 InnoDB Plugin crashes on ADD INDEX,
but it helps understand the reason of the crash.
2009-06-17 The InnoDB Team
@@ -535,6 +794,12 @@
Fix Bug#44320 InnoDB: missing DB_ROLL_PTR in Table Monitor COLUMNS
output
+2009-04-29 The InnoDB Team
+
+ * fil/fil0fil.c, include/fil0fil.h, include/mtr0mtr.h,
+ log/log0recv.c:
+ Fix Bug#41609 Crash recovery does not work for InnoDB temporary tables
+
2009-04-23 The InnoDB Team
* row/row0mysql.c:
diff --git a/storage/xtradb/Makefile.am b/storage/xtradb/Makefile.am
index 36d250890f5..b0b4721a9c5 100644
--- a/storage/xtradb/Makefile.am
+++ b/storage/xtradb/Makefile.am
@@ -217,6 +217,7 @@ noinst_HEADERS= \
include/ut0lst.h \
include/ut0mem.h \
include/ut0mem.ic \
+ include/ut0rbt.h \
include/ut0rnd.h \
include/ut0rnd.ic \
include/ut0sort.h \
@@ -319,6 +320,7 @@ libxtradb_a_SOURCES= \
ut/ut0dbg.c \
ut/ut0list.c \
ut/ut0mem.c \
+ ut/ut0rbt.c \
ut/ut0rnd.c \
ut/ut0ut.c \
ut/ut0vec.c \
diff --git a/storage/xtradb/btr/btr0btr.c b/storage/xtradb/btr/btr0btr.c
index 520c18553ea..12e57dcc490 100644
--- a/storage/xtradb/btr/btr0btr.c
+++ b/storage/xtradb/btr/btr0btr.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -604,13 +604,15 @@ an x-latch on the tree.
@return rec_get_offsets() of the node pointer record */
static
ulint*
-btr_page_get_father_node_ptr(
-/*=========================*/
+btr_page_get_father_node_ptr_func(
+/*==============================*/
ulint* offsets,/*!< in: work area for the return value */
mem_heap_t* heap, /*!< in: memory heap to use */
btr_cur_t* cursor, /*!< in: cursor pointing to user record,
out: cursor on node pointer record,
its page x-latched */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
dtuple_t* tuple;
@@ -634,7 +636,8 @@ btr_page_get_father_node_ptr(
tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level);
btr_cur_search_to_nth_level(index, level + 1, tuple, PAGE_CUR_LE,
- BTR_CONT_MODIFY_TREE, cursor, 0, mtr);
+ BTR_CONT_MODIFY_TREE, cursor, 0,
+ file, line, mtr);
node_ptr = btr_cur_get_rec(cursor);
ut_ad(!page_rec_is_comp(node_ptr)
@@ -682,6 +685,9 @@ btr_page_get_father_node_ptr(
return(offsets);
}
+#define btr_page_get_father_node_ptr(of,heap,cur,mtr) \
+ btr_page_get_father_node_ptr_func(of,heap,cur,__FILE__,__LINE__,mtr)
+
/************************************************************//**
Returns the upper level node pointer to a page. It is assumed that mtr holds
an x-latch on the tree.
@@ -1475,11 +1481,11 @@ Calculates a split record such that the tuple will certainly fit on
its half-page when the split is performed. We assume in this function
only that the cursor page has at least one user record.
@return split record, or NULL if tuple will be the first record on
-upper half-page */
+the lower or upper half-page (determined by btr_page_tuple_smaller()) */
static
rec_t*
-btr_page_get_sure_split_rec(
-/*========================*/
+btr_page_get_split_rec(
+/*===================*/
btr_cur_t* cursor, /*!< in: cursor at which insert should be made */
const dtuple_t* tuple, /*!< in: tuple to insert */
ulint n_ext) /*!< in: number of externally stored columns */
@@ -1692,11 +1698,13 @@ Inserts a data tuple to a tree on a non-leaf level. It is assumed
that mtr holds an x-latch on the tree. */
UNIV_INTERN
void
-btr_insert_on_non_leaf_level(
-/*=========================*/
+btr_insert_on_non_leaf_level_func(
+/*==============================*/
dict_index_t* index, /*!< in: index */
ulint level, /*!< in: level, must be > 0 */
dtuple_t* tuple, /*!< in: the record to be inserted */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
big_rec_t* dummy_big_rec;
@@ -1708,7 +1716,7 @@ btr_insert_on_non_leaf_level(
btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_LE,
BTR_CONT_MODIFY_TREE,
- &cursor, 0, mtr);
+ &cursor, 0, file, line, mtr);
err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG
| BTR_KEEP_SYS_FLAG
@@ -1854,6 +1862,37 @@ btr_attach_half_pages(
}
/*************************************************************//**
+Determine if a tuple is smaller than any record on the page.
+@return TRUE if smaller */
+static
+ibool
+btr_page_tuple_smaller(
+/*===================*/
+ btr_cur_t* cursor, /*!< in: b-tree cursor */
+ const dtuple_t* tuple, /*!< in: tuple to consider */
+ ulint* offsets,/*!< in/out: temporary storage */
+ ulint n_uniq, /*!< in: number of unique fields
+ in the index page records */
+ mem_heap_t** heap) /*!< in/out: heap for offsets */
+{
+ buf_block_t* block;
+ const rec_t* first_rec;
+ page_cur_t pcur;
+
+ /* Read the first user record in the page. */
+ block = btr_cur_get_block(cursor);
+ page_cur_set_before_first(block, &pcur);
+ page_cur_move_to_next(&pcur);
+ first_rec = page_cur_get_rec(&pcur);
+
+ offsets = rec_get_offsets(
+ first_rec, cursor->index, offsets,
+ n_uniq, heap);
+
+ return(cmp_dtuple_rec(tuple, first_rec, offsets) < 0);
+}
+
+/*************************************************************//**
Splits an index page to halves and inserts the tuple. It is assumed
that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
released within this function! NOTE that the operation of this
@@ -1923,12 +1962,17 @@ func_start:
/* 1. Decide the split record; split_rec == NULL means that the
tuple to be inserted should be the first record on the upper
half-page */
+ insert_left = FALSE;
if (n_iterations > 0) {
direction = FSP_UP;
hint_page_no = page_no + 1;
- split_rec = btr_page_get_sure_split_rec(cursor, tuple, n_ext);
+ split_rec = btr_page_get_split_rec(cursor, tuple, n_ext);
+ if (UNIV_UNLIKELY(split_rec == NULL)) {
+ insert_left = btr_page_tuple_smaller(
+ cursor, tuple, offsets, n_uniq, &heap);
+ }
} else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
direction = FSP_UP;
hint_page_no = page_no + 1;
@@ -1936,37 +1980,24 @@ func_start:
} else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) {
direction = FSP_DOWN;
hint_page_no = page_no - 1;
+ ut_ad(split_rec);
} else {
direction = FSP_UP;
hint_page_no = page_no + 1;
- if (page_get_n_recs(page) == 1) {
- page_cur_t pcur;
-
- /* There is only one record in the index page
- therefore we can't split the node in the middle
- by default. We need to determine whether the
- new record will be inserted to the left or right. */
-
- /* Read the first (and only) record in the page. */
- page_cur_set_before_first(block, &pcur);
- page_cur_move_to_next(&pcur);
- first_rec = page_cur_get_rec(&pcur);
+ /* If there is only one record in the index page, we
+ can't split the node in the middle by default. We need
+ to determine whether the new record will be inserted
+ to the left or right. */
- offsets = rec_get_offsets(
- first_rec, cursor->index, offsets,
- n_uniq, &heap);
-
- /* If the new record is less than the existing record
- the split in the middle will copy the existing
- record to the new node. */
- if (cmp_dtuple_rec(tuple, first_rec, offsets) < 0) {
- split_rec = page_get_middle_rec(page);
- } else {
- split_rec = NULL;
- }
- } else {
+ if (page_get_n_recs(page) > 1) {
split_rec = page_get_middle_rec(page);
+ } else if (btr_page_tuple_smaller(cursor, tuple,
+ offsets, n_uniq, &heap)) {
+ split_rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+ } else {
+ split_rec = NULL;
}
}
@@ -1996,11 +2027,15 @@ func_start:
avoid further splits by inserting the record
to an empty page. */
split_rec = NULL;
- goto insert_right;
+ goto insert_empty;
}
+ } else if (UNIV_UNLIKELY(insert_left)) {
+ first_rec = page_rec_get_next(page_get_infimum_rec(page));
+ move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
} else {
-insert_right:
- insert_left = FALSE;
+insert_empty:
+ ut_ad(!split_rec);
+ ut_ad(!insert_left);
buf = mem_alloc(rec_get_converted_size(cursor->index,
tuple, n_ext));
@@ -2024,7 +2059,11 @@ insert_right:
&& btr_page_insert_fits(cursor, split_rec,
offsets, tuple, n_ext, heap);
} else {
- mem_free(buf);
+ if (!insert_left) {
+ mem_free(buf);
+ buf = NULL;
+ }
+
insert_will_fit = !new_page_zip
&& btr_page_insert_fits(cursor, NULL,
NULL, tuple, n_ext, heap);
@@ -2037,7 +2076,17 @@ insert_right:
}
/* 5. Move then the records to the new page */
- if (direction == FSP_DOWN) {
+ if (direction == FSP_DOWN
+#ifdef UNIV_BTR_AVOID_COPY
+ && page_rec_is_supremum(move_limit)) {
+ /* Instead of moving all records, make the new page
+ the empty page. */
+
+ left_block = block;
+ right_block = new_block;
+ } else if (direction == FSP_DOWN
+#endif /* UNIV_BTR_AVOID_COPY */
+ ) {
/* fputs("Split left\n", stderr); */
if (0
@@ -2080,6 +2129,14 @@ insert_right:
right_block = block;
lock_update_split_left(right_block, left_block);
+#ifdef UNIV_BTR_AVOID_COPY
+ } else if (!split_rec) {
+ /* Instead of moving all records, make the new page
+ the empty page. */
+
+ left_block = new_block;
+ right_block = block;
+#endif /* UNIV_BTR_AVOID_COPY */
} else {
/* fputs("Split right\n", stderr); */
diff --git a/storage/xtradb/btr/btr0cur.c b/storage/xtradb/btr/btr0cur.c
index 06b54bc7120..0e75b4e8442 100644
--- a/storage/xtradb/btr/btr0cur.c
+++ b/storage/xtradb/btr/btr0cur.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -372,6 +372,8 @@ btr_cur_search_to_nth_level(
ulint has_search_latch,/*!< in: info on the latch mode the
caller currently has on btr_search_latch:
RW_S_LATCH, or 0 */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
page_cur_t* page_cursor;
@@ -550,7 +552,7 @@ btr_cur_search_to_nth_level(
retry_page_get:
block = buf_page_get_gen(space, zip_size, page_no,
rw_latch, guess, buf_mode,
- __FILE__, __LINE__, mtr);
+ file, line, mtr);
if (block == NULL) {
if (srv_pass_corrupt_table && buf_mode != BUF_GET_IF_IN_POOL) {
page_cursor->block = 0;
@@ -727,13 +729,15 @@ func_exit:
Opens a cursor at either end of an index. */
UNIV_INTERN
void
-btr_cur_open_at_index_side(
-/*=======================*/
+btr_cur_open_at_index_side_func(
+/*============================*/
ibool from_left, /*!< in: TRUE if open to the low end,
FALSE if to the high end */
dict_index_t* index, /*!< in: index */
ulint latch_mode, /*!< in: latch mode */
btr_cur_t* cursor, /*!< in: cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
page_cur_t* page_cursor;
@@ -778,7 +782,7 @@ btr_cur_open_at_index_side(
page_t* page;
block = buf_page_get_gen(space, zip_size, page_no,
RW_NO_LATCH, NULL, BUF_GET,
- __FILE__, __LINE__, mtr);
+ file, line, mtr);
page = buf_block_get_frame(block);
if (srv_pass_corrupt_table && !page) {
@@ -869,11 +873,13 @@ btr_cur_open_at_index_side(
Positions a cursor at a randomly chosen position within a B-tree. */
UNIV_INTERN
void
-btr_cur_open_at_rnd_pos(
-/*====================*/
+btr_cur_open_at_rnd_pos_func(
+/*=========================*/
dict_index_t* index, /*!< in: index */
ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_cur_t* cursor, /*!< in/out: B-tree cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
page_cur_t* page_cursor;
@@ -908,7 +914,7 @@ btr_cur_open_at_rnd_pos(
block = buf_page_get_gen(space, zip_size, page_no,
RW_NO_LATCH, NULL, BUF_GET,
- __FILE__, __LINE__, mtr);
+ file, line, mtr);
page = buf_block_get_frame(block);
if (srv_pass_corrupt_table && !page) {
@@ -1229,7 +1235,6 @@ btr_cur_optimistic_insert(
ibool inherit;
ulint zip_size;
ulint rec_size;
- mem_heap_t* heap = NULL;
ulint err;
*big_rec = NULL;
@@ -1315,10 +1320,6 @@ btr_cur_optimistic_insert(
index, entry, big_rec_vec);
}
- if (heap) {
- mem_heap_free(heap);
- }
-
return(DB_TOO_BIG_RECORD);
}
}
@@ -1341,15 +1342,11 @@ fail_err:
dtuple_convert_back_big_rec(index, entry, big_rec_vec);
}
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
- }
-
return(err);
}
if (UNIV_UNLIKELY(max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT
- || max_size < rec_size)
+ || max_size < rec_size)
&& UNIV_LIKELY(page_get_n_recs(page) > 1)
&& page_get_max_insert_size(page, 1) < rec_size) {
@@ -1415,10 +1412,6 @@ fail_err:
}
}
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
- }
-
#ifdef BTR_CUR_HASH_ADAPT
if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
btr_search_update_hash_node_on_insert(cursor);
@@ -3282,7 +3275,8 @@ btr_estimate_n_rows_in_range(
btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
BTR_SEARCH_LEAF | BTR_ESTIMATE,
- &cursor, 0, &mtr);
+ &cursor, 0,
+ __FILE__, __LINE__, &mtr);
} else {
btr_cur_open_at_index_side(TRUE, index,
BTR_SEARCH_LEAF | BTR_ESTIMATE,
@@ -3299,7 +3293,8 @@ btr_estimate_n_rows_in_range(
btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
BTR_SEARCH_LEAF | BTR_ESTIMATE,
- &cursor, 0, &mtr);
+ &cursor, 0,
+ __FILE__, __LINE__, &mtr);
} else {
btr_cur_open_at_index_side(FALSE, index,
BTR_SEARCH_LEAF | BTR_ESTIMATE,
@@ -3438,7 +3433,7 @@ btr_estimate_n_pages_not_null(
btr_cur_search_to_nth_level(index, 0, tuple1, PAGE_CUR_G,
BTR_SEARCH_LEAF | BTR_ESTIMATE,
- &cursor, 0, &mtr);
+ &cursor, 0, __FILE__, __LINE__, &mtr);
mtr_commit(&mtr);
@@ -3588,9 +3583,11 @@ btr_estimate_number_of_different_key_vals(
effective_pages = btr_estimate_n_pages_not_null(index, 1 /*k*/, first_rec_path);
if (!effective_pages) {
+ dict_index_stat_mutex_enter(index);
for (j = 0; j <= n_cols; j++) {
index->stat_n_diff_key_vals[j] = (ib_int64_t)index->stat_n_leaf_pages;
}
+ dict_index_stat_mutex_exit(index);
return;
} else if (effective_pages > index->stat_n_leaf_pages) {
effective_pages = index->stat_n_leaf_pages;
@@ -3732,6 +3729,8 @@ btr_estimate_number_of_different_key_vals(
also the pages used for external storage of fields (those pages are
included in index->stat_n_leaf_pages) */
+ dict_index_stat_mutex_enter(index);
+
for (j = 0; j <= n_cols; j++) {
index->stat_n_diff_key_vals[j]
= ((n_diff[j]
@@ -3770,8 +3769,9 @@ btr_estimate_number_of_different_key_vals(
}
}
- mem_free(n_diff);
+ dict_index_stat_mutex_exit(index);
+ mem_free(n_diff);
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
@@ -4639,7 +4639,7 @@ btr_free_externally_stored_field(
/* In the rollback of uncommitted transactions, we may
encounter a clustered index record whose BLOBs have
not been written. There is nothing to free then. */
- ut_a(rb_ctx == RB_RECOVERY);
+ ut_a(rb_ctx == RB_RECOVERY || rb_ctx == RB_RECOVERY_PURGE_REC);
return;
}
@@ -4685,7 +4685,7 @@ btr_free_externally_stored_field(
|| (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
& BTR_EXTERN_OWNER_FLAG)
/* Rollback and inherited field */
- || (rb_ctx != RB_NONE
+ || ((rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY)
&& (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
& BTR_EXTERN_INHERITED_FLAG))) {
diff --git a/storage/xtradb/btr/btr0pcur.c b/storage/xtradb/btr/btr0pcur.c
index 86beea5a899..537c26f6bf2 100644
--- a/storage/xtradb/btr/btr0pcur.c
+++ b/storage/xtradb/btr/btr0pcur.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -211,10 +211,12 @@ record and it can be restored on a user record whose ordering fields
are identical to the ones of the original user record */
UNIV_INTERN
ibool
-btr_pcur_restore_position(
-/*======================*/
+btr_pcur_restore_position_func(
+/*===========================*/
ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_pcur_t* cursor, /*!< in: detached persistent cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
dict_index_t* index;
@@ -223,6 +225,9 @@ btr_pcur_restore_position(
ulint old_mode;
mem_heap_t* heap;
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor));
if (UNIV_UNLIKELY(cursor->old_stored != BTR_PCUR_OLD_STORED)
@@ -263,7 +268,8 @@ btr_pcur_restore_position(
if (UNIV_LIKELY(buf_page_optimistic_get(
latch_mode,
cursor->block_when_stored,
- cursor->modify_clock, mtr))) {
+ cursor->modify_clock,
+ file, line, mtr))) {
cursor->pos_state = BTR_PCUR_IS_POSITIONED;
buf_block_dbg_add_level(btr_pcur_get_block(cursor),
@@ -318,8 +324,8 @@ btr_pcur_restore_position(
mode = PAGE_CUR_L;
}
- btr_pcur_open_with_no_init(index, tuple, mode, latch_mode,
- cursor, 0, mtr);
+ btr_pcur_open_with_no_init_func(index, tuple, mode, latch_mode,
+ cursor, 0, file, line, mtr);
/* Restore the old search mode */
cursor->search_mode = old_mode;
@@ -568,8 +574,8 @@ before first in tree. The latching mode must be BTR_SEARCH_LEAF or
BTR_MODIFY_LEAF. */
UNIV_INTERN
void
-btr_pcur_open_on_user_rec(
-/*======================*/
+btr_pcur_open_on_user_rec_func(
+/*===========================*/
dict_index_t* index, /*!< in: index */
const dtuple_t* tuple, /*!< in: tuple on which search done */
ulint mode, /*!< in: PAGE_CUR_L, ... */
@@ -577,9 +583,12 @@ btr_pcur_open_on_user_rec(
BTR_MODIFY_LEAF */
btr_pcur_t* cursor, /*!< in: memory buffer for persistent
cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
- btr_pcur_open(index, tuple, mode, latch_mode, cursor, mtr);
+ btr_pcur_open_func(index, tuple, mode, latch_mode, cursor,
+ file, line, mtr);
if ((mode == PAGE_CUR_GE) || (mode == PAGE_CUR_G)) {
diff --git a/storage/xtradb/buf/buf0buddy.c b/storage/xtradb/buf/buf0buddy.c
index d5e45745757..e4a79026d3a 100644
--- a/storage/xtradb/buf/buf0buddy.c
+++ b/storage/xtradb/buf/buf0buddy.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -430,6 +430,8 @@ buf_buddy_relocate_block(
}
mutex_exit(&flush_list_mutex);
+ UNIV_MEM_INVALID(bpage, sizeof *bpage);
+
mutex_exit(&buf_pool_zip_mutex);
mutex_exit(&zip_free_mutex);
return(TRUE);
@@ -567,7 +569,12 @@ success:
}
} else if (i == buf_buddy_get_slot(sizeof(buf_page_t))) {
/* This must be a buf_page_t object. */
+#if UNIV_WORD_SIZE == 4
+ /* On 32-bit systems, there is no padding in
+ buf_page_t. On other systems, Valgrind could complain
+ about uninitialized pad bytes. */
UNIV_MEM_ASSERT_RW(src, size);
+#endif
mutex_exit(&zip_free_mutex);
diff --git a/storage/xtradb/buf/buf0buf.c b/storage/xtradb/buf/buf0buf.c
index 79a9488e339..a5a9e1d9004 100644
--- a/storage/xtradb/buf/buf0buf.c
+++ b/storage/xtradb/buf/buf0buf.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -277,6 +277,8 @@ the read requests for the whole area.
#ifndef UNIV_HOTBACKUP
/** Value in microseconds */
static const int WAIT_FOR_READ = 5000;
+/** Number of attemtps made to read in a page in the buffer pool */
+static const ulint BUF_PAGE_READ_MAX_RETRIES = 100;
/** The buffer buf_pool of the database */
UNIV_INTERN buf_pool_t* buf_pool = NULL;
@@ -826,7 +828,7 @@ buf_chunk_init(
buf_block_init(block, frame);
-#ifdef HAVE_purify
+#ifdef HAVE_valgrind
/* Wipe contents of frame to eliminate a Purify warning */
memset(block->frame, '\0', UNIV_PAGE_SIZE);
#endif
@@ -1150,7 +1152,9 @@ buf_pool_drop_hash_index(void)
when we have an x-latch on btr_search_latch;
see the comment in buf0buf.h */
- if (!block->is_hashed) {
+ if (buf_block_get_state(block)
+ != BUF_BLOCK_FILE_PAGE
+ || !block->is_hashed) {
continue;
}
@@ -1283,8 +1287,6 @@ buf_relocate(
HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage);
-
- UNIV_MEM_INVALID(bpage, sizeof *bpage);
}
/********************************************************************//**
@@ -1980,14 +1982,14 @@ buf_zip_decompress(
buf_block_t* block, /*!< in/out: block */
ibool check) /*!< in: TRUE=verify the page checksum */
{
- const byte* frame = block->page.zip.data;
+ const byte* frame = block->page.zip.data;
+ ulint stamp_checksum = mach_read_from_4(
+ frame + FIL_PAGE_SPACE_OR_CHKSUM);
ut_ad(buf_block_get_zip_size(block));
ut_a(buf_block_get_space(block) != 0);
- if (UNIV_LIKELY(check)) {
- ulint stamp_checksum = mach_read_from_4(
- frame + FIL_PAGE_SPACE_OR_CHKSUM);
+ if (UNIV_LIKELY(check && stamp_checksum != BUF_NO_CHECKSUM_MAGIC)) {
ulint calc_checksum = page_zip_calc_checksum(
frame, page_zip_get_size(&block->page.zip));
@@ -2196,6 +2198,7 @@ buf_page_get_gen(
unsigned access_time;
ulint fix_type;
ibool must_read;
+ ulint retries = 0;
mutex_t* block_mutex;
trx_t* trx = NULL;
ulint sec;
@@ -2204,6 +2207,7 @@ buf_page_get_gen(
ib_uint64_t finish_time;
ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
ut_ad((rw_latch == RW_S_LATCH)
|| (rw_latch == RW_X_LATCH)
|| (rw_latch == RW_NO_LATCH));
@@ -2271,7 +2275,29 @@ loop2:
return(NULL);
}
- buf_read_page(space, zip_size, offset, trx);
+ if (buf_read_page(space, zip_size, offset, trx)) {
+ retries = 0;
+ } else if (retries < BUF_PAGE_READ_MAX_RETRIES) {
+ ++retries;
+ } else {
+ fprintf(stderr, "InnoDB: Error: Unable"
+ " to read tablespace %lu page no"
+ " %lu into the buffer pool after"
+ " %lu attempts\n"
+ "InnoDB: The most probable cause"
+ " of this error may be that the"
+ " table has been corrupted.\n"
+ "InnoDB: You can try to fix this"
+ " problem by using"
+ " innodb_force_recovery.\n"
+ "InnoDB: Please see reference manual"
+ " for more details.\n"
+ "InnoDB: Aborting...\n",
+ space, offset,
+ BUF_PAGE_READ_MAX_RETRIES);
+
+ ut_error;
+ }
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
ut_a(++buf_dbg_counter % 37 || buf_validate());
@@ -2414,22 +2440,8 @@ wait_until_unfixed:
ut_ad(!block->page.in_flush_list);
} else {
/* Relocate buf_pool->flush_list. */
- buf_page_t* b;
-
- b = UT_LIST_GET_PREV(flush_list, &block->page);
- ut_ad(block->page.in_flush_list);
- UT_LIST_REMOVE(flush_list, buf_pool->flush_list,
- &block->page);
-
- if (b) {
- UT_LIST_INSERT_AFTER(
- flush_list, buf_pool->flush_list, b,
- &block->page);
- } else {
- UT_LIST_ADD_FIRST(
- flush_list, buf_pool->flush_list,
- &block->page);
- }
+ buf_flush_relocate_on_flush_list(bpage,
+ &block->page);
}
mutex_exit(&flush_list_mutex);
@@ -2447,6 +2459,9 @@ wait_until_unfixed:
block->page.buf_fix_count = 1;
buf_block_set_io_fix(block, BUF_IO_READ);
rw_lock_x_lock(&block->lock);
+
+ UNIV_MEM_INVALID(bpage, sizeof *bpage);
+
mutex_exit(block_mutex);
mutex_exit(&buf_pool_zip_mutex);
@@ -2461,8 +2476,9 @@ wait_until_unfixed:
/* Decompress the page and apply buffered operations
while not holding buf_pool_mutex or block->mutex. */
success = buf_zip_decompress(block, srv_use_checksums);
+ ut_a(success);
- if (UNIV_LIKELY(success)) {
+ if (UNIV_LIKELY(!recv_no_ibuf_operations)) {
ibuf_merge_or_delete_for_page(block, space, offset,
zip_size, TRUE);
}
@@ -2478,14 +2494,6 @@ wait_until_unfixed:
buf_pool->n_pend_unzip--;
mutex_exit(&buf_pool_mutex);
rw_lock_x_unlock(&block->lock);
-
- if (UNIV_UNLIKELY(!success)) {
-
- //buf_pool_mutex_exit();
- mutex_exit(block_mutex);
- return(NULL);
- }
-
break;
case BUF_BLOCK_ZIP_FREE:
@@ -2500,7 +2508,12 @@ wait_until_unfixed:
ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
//mutex_enter(&block->mutex);
+#if UNIV_WORD_SIZE == 4
+ /* On 32-bit systems, there is no padding in buf_page_t. On
+ other systems, Valgrind could complain about uninitialized pad
+ bytes. */
UNIV_MEM_ASSERT_RW(&block->page, sizeof block->page);
+#endif
buf_block_buf_fix_inc(block, file, line);
@@ -2603,8 +2616,8 @@ page.
@return TRUE if success */
UNIV_INTERN
ibool
-buf_page_optimistic_get_func(
-/*=========================*/
+buf_page_optimistic_get(
+/*====================*/
ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
buf_block_t* block, /*!< in: guessed buffer block */
ib_uint64_t modify_clock,/*!< in: modify clock value if mode is
@@ -2618,7 +2631,9 @@ buf_page_optimistic_get_func(
ulint fix_type;
trx_t* trx = NULL;
- ut_ad(mtr && block);
+ ut_ad(block);
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
mutex_enter(&block->mutex);
@@ -2738,6 +2753,7 @@ buf_page_get_known_nowait(
trx_t* trx = NULL;
ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
mutex_enter(&block->mutex);
@@ -2846,6 +2862,9 @@ buf_page_try_get_func(
ibool success;
ulint fix_type;
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
//buf_pool_mutex_enter();
rw_lock_s_lock(&page_hash_latch);
block = buf_block_hash_get(space_id, page_no);
@@ -3249,6 +3268,7 @@ buf_page_create(
ulint time_ms = ut_time_ms();
ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
ut_ad(space || !zip_size);
free_block = buf_LRU_get_free_block(0);
@@ -3431,7 +3451,8 @@ buf_page_io_complete(
read_space_id = mach_read_from_4(
frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
- if (bpage->space == TRX_SYS_SPACE
+ if ((bpage->space == TRX_SYS_SPACE
+ || (srv_doublewrite_file && bpage->space == TRX_DOUBLEWRITE_SPACE))
&& trx_doublewrite_page_inside(bpage->offset)) {
ut_print_timestamp(stderr);
@@ -3503,7 +3524,7 @@ corrupt:
REFMAN "forcing-recovery.html\n"
"InnoDB: about forcing recovery.\n", stderr);
- if (srv_pass_corrupt_table && bpage->space > 0
+ if (srv_pass_corrupt_table && !trx_sys_sys_space(bpage->space)
&& bpage->space < SRV_LOG_SPACE_FIRST_ID) {
fprintf(stderr,
"InnoDB: space %u will be treated as corrupt.\n",
diff --git a/storage/xtradb/buf/buf0flu.c b/storage/xtradb/buf/buf0flu.c
index 1735f6ac3cb..17588475bbf 100644
--- a/storage/xtradb/buf/buf0flu.c
+++ b/storage/xtradb/buf/buf0flu.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -88,6 +88,146 @@ buf_flush_validate_low(void);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
/********************************************************************//**
+Insert a block in the flush_rbt and returns a pointer to its
+predecessor or NULL if no predecessor. The ordering is maintained
+on the basis of the <oldest_modification, space, offset> key.
+@return pointer to the predecessor or NULL if no predecessor. */
+static
+buf_page_t*
+buf_flush_insert_in_flush_rbt(
+/*==========================*/
+ buf_page_t* bpage) /*!< in: bpage to be inserted. */
+{
+ buf_page_t* prev = NULL;
+ const ib_rbt_node_t* c_node;
+ const ib_rbt_node_t* p_node;
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&flush_list_mutex));
+
+ /* Insert this buffer into the rbt. */
+ c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
+ ut_a(c_node != NULL);
+
+ /* Get the predecessor. */
+ p_node = rbt_prev(buf_pool->flush_rbt, c_node);
+
+ if (p_node != NULL) {
+ prev = *rbt_value(buf_page_t*, p_node);
+ ut_a(prev != NULL);
+ }
+
+ return(prev);
+}
+
+/********************************************************************//**
+Delete a bpage from the flush_rbt. */
+static
+void
+buf_flush_delete_from_flush_rbt(
+/*============================*/
+ buf_page_t* bpage) /*!< in: bpage to be removed. */
+{
+
+ ibool ret = FALSE;
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&flush_list_mutex));
+ ret = rbt_delete(buf_pool->flush_rbt, &bpage);
+ ut_ad(ret);
+}
+
+/********************************************************************//**
+Compare two modified blocks in the buffer pool. The key for comparison
+is:
+key = <oldest_modification, space, offset>
+This comparison is used to maintian ordering of blocks in the
+buf_pool->flush_rbt.
+Note that for the purpose of flush_rbt, we only need to order blocks
+on the oldest_modification. The other two fields are used to uniquely
+identify the blocks.
+@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
+static
+int
+buf_flush_block_cmp(
+/*================*/
+ const void* p1, /*!< in: block1 */
+ const void* p2) /*!< in: block2 */
+{
+ int ret;
+ const buf_page_t* b1;
+ const buf_page_t* b2;
+
+ ut_ad(p1 != NULL);
+ ut_ad(p2 != NULL);
+
+ b1 = *(const buf_page_t**) p1;
+ b2 = *(const buf_page_t**) p2;
+
+ ut_ad(b1 != NULL);
+ ut_ad(b2 != NULL);
+
+ ut_ad(b1->in_flush_list);
+ ut_ad(b2->in_flush_list);
+
+ if (b2->oldest_modification
+ > b1->oldest_modification) {
+ return(1);
+ }
+
+ if (b2->oldest_modification
+ < b1->oldest_modification) {
+ return(-1);
+ }
+
+ /* If oldest_modification is same then decide on the space. */
+ ret = (int)(b2->space - b1->space);
+
+ /* Or else decide ordering on the offset field. */
+ return(ret ? ret : (int)(b2->offset - b1->offset));
+}
+
+/********************************************************************//**
+Initialize the red-black tree to speed up insertions into the flush_list
+during recovery process. Should be called at the start of recovery
+process before any page has been read/written. */
+UNIV_INTERN
+void
+buf_flush_init_flush_rbt(void)
+/*==========================*/
+{
+ //buf_pool_mutex_enter();
+ mutex_enter(&flush_list_mutex);
+
+ /* Create red black tree for speedy insertions in flush list. */
+ buf_pool->flush_rbt = rbt_create(sizeof(buf_page_t*),
+ buf_flush_block_cmp);
+ //buf_pool_mutex_exit();
+ mutex_exit(&flush_list_mutex);
+}
+
+/********************************************************************//**
+Frees up the red-black tree. */
+UNIV_INTERN
+void
+buf_flush_free_flush_rbt(void)
+/*==========================*/
+{
+ //buf_pool_mutex_enter();
+ mutex_enter(&flush_list_mutex);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ rbt_free(buf_pool->flush_rbt);
+ buf_pool->flush_rbt = NULL;
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&flush_list_mutex);
+}
+
+/********************************************************************//**
Inserts a modified block into the flush list. */
UNIV_INTERN
void
@@ -102,6 +242,13 @@ buf_flush_insert_into_flush_list(
|| (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
<= block->page.oldest_modification));
+ /* If we are in the recovery then we need to update the flush
+ red-black tree as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_insert_sorted_into_flush_list(block);
+ return;
+ }
+
ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
ut_ad(block->page.in_LRU_list);
ut_ad(block->page.in_page_hash);
@@ -140,26 +287,27 @@ buf_flush_insert_sorted_into_flush_list(
ut_d(block->page.in_flush_list = TRUE);
prev_b = NULL;
- b = UT_LIST_GET_FIRST(buf_pool->flush_list);
- if (srv_fast_recovery) {
- /* speed hack */
- if (b == NULL || b->oldest_modification < block->page.oldest_modification) {
- UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page);
+ /* For the most part when this function is called the flush_rbt
+ should not be NULL. In a very rare boundary case it is possible
+ that the flush_rbt has already been freed by the recovery thread
+ before the last page was hooked up in the flush_list by the
+ io-handler thread. In that case we'll just do a simple
+ linear search in the else block. */
+ if (buf_pool->flush_rbt) {
+
+ prev_b = buf_flush_insert_in_flush_rbt(&block->page);
+
} else {
- b = UT_LIST_GET_LAST(buf_pool->flush_list);
- if (b->oldest_modification < block->page.oldest_modification) {
- /* align oldest_modification not to sort */
- block->page.oldest_modification = b->oldest_modification;
+
+ b = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+ while (b && b->oldest_modification
+ > block->page.oldest_modification) {
+ ut_ad(b->in_flush_list);
+ prev_b = b;
+ b = UT_LIST_GET_NEXT(flush_list, b);
}
- UT_LIST_ADD_LAST(flush_list, buf_pool->flush_list, &block->page);
- }
- } else {
- /* normal */
- while (b && b->oldest_modification > block->page.oldest_modification) {
- ut_ad(b->in_flush_list);
- prev_b = b;
- b = UT_LIST_GET_NEXT(flush_list, b);
}
if (prev_b == NULL) {
@@ -168,7 +316,6 @@ buf_flush_insert_sorted_into_flush_list(
UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list,
prev_b, &block->page);
}
- }
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
ut_a(buf_flush_validate_low());
@@ -262,7 +409,6 @@ buf_flush_remove(
mutex_enter(&flush_list_mutex);
ut_ad(bpage->in_flush_list);
- ut_d(bpage->in_flush_list = FALSE);
switch (buf_page_get_state(bpage)) {
case BUF_BLOCK_ZIP_PAGE:
@@ -285,6 +431,15 @@ buf_flush_remove(
break;
}
+ /* If the flush_rbt is active then delete from it as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_delete_from_flush_rbt(bpage);
+ }
+
+ /* Must be done after we have removed it from the flush_rbt
+ because we assert on in_flush_list in comparison function. */
+ ut_d(bpage->in_flush_list = FALSE);
+
bpage->oldest_modification = 0;
ut_d(UT_LIST_VALIDATE(flush_list, buf_page_t, buf_pool->flush_list,
@@ -293,6 +448,64 @@ buf_flush_remove(
}
/********************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage has already been
+copied to dpage. */
+UNIV_INTERN
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+ buf_page_t* bpage, /*!< in/out: control block being moved */
+ buf_page_t* dpage) /*!< in/out: destination block */
+{
+ buf_page_t* prev;
+ buf_page_t* prev_b = NULL;
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&flush_list_mutex));
+
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+ ut_ad(bpage->in_flush_list);
+ ut_ad(dpage->in_flush_list);
+
+ /* If recovery is active we must swap the control blocks in
+ the flush_rbt as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_delete_from_flush_rbt(bpage);
+ prev_b = buf_flush_insert_in_flush_rbt(dpage);
+ }
+
+ /* Must be done after we have removed it from the flush_rbt
+ because we assert on in_flush_list in comparison function. */
+ ut_d(bpage->in_flush_list = FALSE);
+
+ prev = UT_LIST_GET_PREV(flush_list, bpage);
+ UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
+
+ if (prev) {
+ ut_ad(prev->in_flush_list);
+ UT_LIST_INSERT_AFTER(
+ flush_list,
+ buf_pool->flush_list,
+ prev, dpage);
+ } else {
+ UT_LIST_ADD_FIRST(
+ flush_list,
+ buf_pool->flush_list,
+ dpage);
+ }
+
+ /* Just an extra check. Previous in flush_list
+ should be the same control block as in flush_rbt. */
+ ut_a(!buf_pool->flush_rbt || prev_b == prev);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+}
+
+/********************************************************************//**
Updates the flush system data structures when a write is completed. */
UNIV_INTERN
void
@@ -452,7 +665,8 @@ corrupted_page:
write_buf = trx_doublewrite->write_buf;
i = 0;
- fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
+ fil_io(OS_FILE_WRITE, TRUE,
+ (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0,
trx_doublewrite->block1, 0, len,
(void*) write_buf, NULL);
@@ -489,7 +703,8 @@ corrupted_page:
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);
- fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
+ fil_io(OS_FILE_WRITE, TRUE,
+ (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0,
trx_doublewrite->block2, 0, len,
(void*) write_buf, NULL);
@@ -519,7 +734,7 @@ corrupted_page:
flush:
/* Now flush the doublewrite buffer data to disk */
- fil_flush(TRX_SYS_SPACE);
+ fil_flush(srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE);
/* We know that the writes have been flushed to disk now
and in recovery we will find them in the doublewrite buffer
@@ -1473,24 +1688,45 @@ ibool
buf_flush_validate_low(void)
/*========================*/
{
- buf_page_t* bpage;
+ buf_page_t* bpage;
+ const ib_rbt_node_t* rnode = NULL;
UT_LIST_VALIDATE(flush_list, buf_page_t, buf_pool->flush_list,
ut_ad(ut_list_node_313->in_flush_list));
bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
+ /* If we are in recovery mode i.e.: flush_rbt != NULL
+ then each block in the flush_list must also be present
+ in the flush_rbt. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ rnode = rbt_first(buf_pool->flush_rbt);
+ }
+
while (bpage != NULL) {
const ib_uint64_t om = bpage->oldest_modification;
ut_ad(bpage->in_flush_list);
//ut_a(buf_page_in_file(bpage)); /* optimistic */
ut_a(om > 0);
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ ut_a(rnode);
+ buf_page_t* rpage = *rbt_value(buf_page_t*,
+ rnode);
+ ut_a(rpage);
+ ut_a(rpage == bpage);
+ rnode = rbt_next(buf_pool->flush_rbt, rnode);
+ }
+
bpage = UT_LIST_GET_NEXT(flush_list, bpage);
ut_a(!bpage || om >= bpage->oldest_modification);
}
+ /* By this time we must have exhausted the traversal of
+ flush_rbt (if active) as well. */
+ ut_a(rnode == NULL);
+
return(TRUE);
}
diff --git a/storage/xtradb/buf/buf0lru.c b/storage/xtradb/buf/buf0lru.c
index 58e2c23275b..65fdf342e4f 100644
--- a/storage/xtradb/buf/buf0lru.c
+++ b/storage/xtradb/buf/buf0lru.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -371,21 +371,39 @@ scan_again:
bpage = UT_LIST_GET_LAST(buf_pool->LRU);
while (bpage != NULL) {
- mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);
buf_page_t* prev_bpage;
+ ibool prev_bpage_buf_fix = FALSE;
ut_a(buf_page_in_file(bpage));
prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
- if (!block_mutex) {
- bpage = prev_bpage;
- continue;
- }
+ /* bpage->space and bpage->io_fix are protected by
+ buf_pool_mutex and block_mutex. It is safe to check
+ them while holding buf_pool_mutex only. */
+
+ if (buf_page_get_space(bpage) != id) {
+ /* Skip this block, as it does not belong to
+ the space that is being invalidated. */
+ } else if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+ /* We cannot remove this page during this scan
+ yet; maybe the system is currently reading it
+ in, or flushing the modifications to the file */
+
+ all_freed = FALSE;
+ } else {
+ mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);
- if (buf_page_get_space(bpage) == id) {
- if (bpage->buf_fix_count > 0
- || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+ if (!block_mutex) {
+ /* It may be impossible case...
+ Something wrong, so will be scan_again */
+
+ all_freed = FALSE;
+
+ goto next_page_no_mutex;
+ }
+
+ if (bpage->buf_fix_count > 0) {
/* We cannot remove this page during
this scan yet; maybe the system is
@@ -405,8 +423,40 @@ scan_again:
(ulong) buf_page_get_page_no(bpage));
}
#endif
- if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE
- && ((buf_block_t*) bpage)->is_hashed) {
+ if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
+ /* This is a compressed-only block
+ descriptor. Ensure that prev_bpage
+ cannot be relocated when bpage is freed. */
+ if (UNIV_LIKELY(prev_bpage != NULL)) {
+ switch (buf_page_get_state(
+ prev_bpage)) {
+ case BUF_BLOCK_FILE_PAGE:
+ /* Descriptors of uncompressed
+ blocks will not be relocated,
+ because we are holding the
+ buf_pool_mutex. */
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ /* Descriptors of compressed-
+ only blocks can be relocated,
+ unless they are buffer-fixed.
+ Because both bpage and
+ prev_bpage are protected by
+ buf_pool_zip_mutex, it is
+ not necessary to acquire
+ further mutexes. */
+ ut_ad(&buf_pool_zip_mutex
+ == block_mutex);
+ ut_ad(mutex_own(block_mutex));
+ prev_bpage_buf_fix = TRUE;
+ prev_bpage->buf_fix_count++;
+ break;
+ default:
+ ut_error;
+ }
+ }
+ } else if (((buf_block_t*) bpage)->is_hashed) {
ulint page_no;
ulint zip_size;
@@ -432,7 +482,8 @@ scan_again:
buf_flush_remove(bpage);
}
- /* Remove from the LRU list */
+ /* Remove from the LRU list. */
+
if (buf_LRU_block_remove_hashed_page(bpage, TRUE)
!= BUF_BLOCK_ZIP_FREE) {
buf_LRU_block_free_hashed_page((buf_block_t*)
@@ -444,18 +495,27 @@ scan_again:
ut_ad(block_mutex == &buf_pool_zip_mutex);
ut_ad(!mutex_own(block_mutex));
- /* The compressed block descriptor
- (bpage) has been deallocated and
- block_mutex released. Also,
- buf_buddy_free() may have relocated
- prev_bpage. Rescan the LRU list. */
+ if (prev_bpage_buf_fix) {
+ /* We temporarily buffer-fixed
+ prev_bpage, so that
+ buf_buddy_free() could not
+ relocate it, in case it was a
+ compressed-only block
+ descriptor. */
+
+ mutex_enter(block_mutex);
+ ut_ad(prev_bpage->buf_fix_count > 0);
+ prev_bpage->buf_fix_count--;
+ mutex_exit(block_mutex);
+ }
- bpage = UT_LIST_GET_LAST(buf_pool->LRU);
- continue;
+ goto next_page_no_mutex;
}
- }
next_page:
- mutex_exit(block_mutex);
+ mutex_exit(block_mutex);
+ }
+
+next_page_no_mutex:
bpage = prev_bpage;
}
@@ -1425,7 +1485,12 @@ buf_LRU_free_block(
ut_ad(buf_page_in_file(bpage));
//ut_ad(bpage->in_LRU_list);
ut_ad(!bpage->in_flush_list == !bpage->oldest_modification);
+#if UNIV_WORD_SIZE == 4
+ /* On 32-bit systems, there is no padding in buf_page_t. On
+ other systems, Valgrind could complain about uninitialized pad
+ bytes. */
UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage);
+#endif
if (!bpage->in_LRU_list || !block_mutex || !buf_page_can_relocate(bpage)) {
@@ -1558,8 +1623,13 @@ not_freed:
ut_ad(prev_b->in_LRU_list);
ut_ad(buf_page_in_file(prev_b));
+#if UNIV_WORD_SIZE == 4
+ /* On 32-bit systems, there is no
+ padding in buf_page_t. On other
+ systems, Valgrind could complain about
+ uninitialized pad bytes. */
UNIV_MEM_ASSERT_RW(prev_b, sizeof *prev_b);
-
+#endif
UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU,
prev_b, b);
@@ -1600,26 +1670,8 @@ not_freed:
if (b->state == BUF_BLOCK_ZIP_PAGE) {
buf_LRU_insert_zip_clean(b);
} else {
- buf_page_t* prev;
-
- ut_ad(b->in_flush_list);
- ut_d(bpage->in_flush_list = FALSE);
-
- prev = UT_LIST_GET_PREV(flush_list, b);
- UT_LIST_REMOVE(flush_list, buf_pool->flush_list, b);
-
- if (prev) {
- ut_ad(prev->in_flush_list);
- UT_LIST_INSERT_AFTER(
- flush_list,
- buf_pool->flush_list,
- prev, b);
- } else {
- UT_LIST_ADD_FIRST(
- flush_list,
- buf_pool->flush_list,
- b);
- }
+ /* Relocate on buf_pool->flush_list. */
+ buf_flush_relocate_on_flush_list(bpage, b);
}
mutex_exit(&flush_list_mutex);
@@ -1792,7 +1844,12 @@ buf_LRU_block_remove_hashed_page(
ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
ut_a(bpage->buf_fix_count == 0);
+#if UNIV_WORD_SIZE == 4
+ /* On 32-bit systems, there is no padding in
+ buf_page_t. On other systems, Valgrind could complain
+ about uninitialized pad bytes. */
UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage);
+#endif
buf_LRU_remove_block(bpage);
diff --git a/storage/xtradb/buf/buf0rea.c b/storage/xtradb/buf/buf0rea.c
index e5d04df797f..59de70d9a8a 100644
--- a/storage/xtradb/buf/buf0rea.c
+++ b/storage/xtradb/buf/buf0rea.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -86,7 +86,9 @@ buf_read_page_low(
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
- if (trx_doublewrite && space == TRX_SYS_SPACE
+ if (trx_doublewrite
+ && (space == TRX_SYS_SPACE
+ || (srv_doublewrite_file && space == TRX_DOUBLEWRITE_SPACE))
&& ( (offset >= trx_doublewrite->block1
&& offset < trx_doublewrite->block1
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
@@ -656,6 +658,50 @@ buf_read_recv_pages(
/* It is a single table tablespace and the .ibd file is
missing: do nothing */
+ /* the log records should be treated here same reason
+ for http://bugs.mysql.com/bug.php?id=43948 */
+
+ if (recv_recovery_is_on()) {
+ recv_addr_t* recv_addr;
+
+ mutex_enter(&(recv_sys->mutex));
+
+ if (recv_sys->apply_log_recs == FALSE) {
+ mutex_exit(&(recv_sys->mutex));
+ goto not_to_recover;
+ }
+
+ for (i = 0; i < n_stored; i++) {
+ /* recv_get_fil_addr_struct() */
+ recv_addr = HASH_GET_FIRST(recv_sys->addr_hash,
+ hash_calc_hash(ut_fold_ulint_pair(space, page_nos[i]),
+ recv_sys->addr_hash));
+ while (recv_addr) {
+ if ((recv_addr->space == space)
+ && (recv_addr->page_no == page_nos[i])) {
+ break;
+ }
+ recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
+ }
+
+ if ((recv_addr == NULL)
+ || (recv_addr->state == RECV_BEING_PROCESSED)
+ || (recv_addr->state == RECV_PROCESSED)) {
+ continue;
+ }
+
+ recv_addr->state = RECV_PROCESSED;
+
+ ut_a(recv_sys->n_addrs);
+ recv_sys->n_addrs--;
+ }
+
+ mutex_exit(&(recv_sys->mutex));
+
+ fprintf(stderr, " (cannot find space: %lu)", space);
+ }
+not_to_recover:
+
return;
}
@@ -674,10 +720,10 @@ buf_read_recv_pages(
count++;
- if (count > 5000) {
+ if (count > 1000) {
fprintf(stderr,
"InnoDB: Error: InnoDB has waited for"
- " 50 seconds for pending\n"
+ " 10 seconds for pending\n"
"InnoDB: reads to the buffer pool to"
" be finished.\n"
"InnoDB: Number of pending reads %lu,"
diff --git a/storage/xtradb/data/data0data.c b/storage/xtradb/data/data0data.c
index e3c1f1b4f23..0715b49bf9c 100644
--- a/storage/xtradb/data/data0data.c
+++ b/storage/xtradb/data/data0data.c
@@ -666,6 +666,21 @@ dtuple_convert_big_rec(
goto skip_field;
}
+ /* In DYNAMIC and COMPRESSED format, store
+ locally any non-BLOB columns whose maximum
+ length does not exceed 256 bytes. This is
+ because there is no room for the "external
+ storage" flag when the maximum length is 255
+ bytes or less. This restriction trivially
+ holds in REDUNDANT and COMPACT format, because
+ there we always store locally columns whose
+ length is up to local_len == 788 bytes.
+ @see rec_init_offsets_comp_ordinary */
+ if (ifield->col->mtype != DATA_BLOB
+ && ifield->col->len < 256) {
+ goto skip_field;
+ }
+
longest_i = i;
longest = savings;
diff --git a/storage/xtradb/dict/dict0boot.c b/storage/xtradb/dict/dict0boot.c
index 0eb73e6c2f9..0a713f0deaa 100644
--- a/storage/xtradb/dict/dict0boot.c
+++ b/storage/xtradb/dict/dict0boot.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -275,6 +275,9 @@ dict_boot(void)
and (TYPE & DICT_TF_FORMAT_MASK) are nonzero and TYPE = table->flags */
dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0);
+ /* MIX_LEN may contain additional table flags when
+ ROW_FORMAT!=REDUNDANT. Currently, these flags include
+ DICT_TF2_TEMPORARY. */
dict_mem_table_add_col(table, heap, "MIX_LEN", DATA_INT, 0, 4);
dict_mem_table_add_col(table, heap, "CLUSTER_NAME", DATA_BINARY, 0, 0);
dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);
@@ -358,7 +361,7 @@ dict_boot(void)
dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);
dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4);
- /* The '+ 2' below comes from the 2 system fields */
+ /* The '+ 2' below comes from the fields DB_TRX_ID, DB_ROLL_PTR */
#if DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2
#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2"
#endif
@@ -368,6 +371,9 @@ dict_boot(void)
#if DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2
#error "DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2"
#endif
+#if DICT_SYS_INDEXES_NAME_FIELD != 2 + 2
+#error "DICT_SYS_INDEXES_NAME_FIELD != 2 + 2"
+#endif
table->id = DICT_INDEXES_ID;
dict_table_add_to_cache(table, heap);
diff --git a/storage/xtradb/dict/dict0crea.c b/storage/xtradb/dict/dict0crea.c
index e315716551e..d4e735c73ca 100644
--- a/storage/xtradb/dict/dict0crea.c
+++ b/storage/xtradb/dict/dict0crea.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -51,16 +51,18 @@ static
dtuple_t*
dict_create_sys_tables_tuple(
/*=========================*/
- dict_table_t* table, /*!< in: table */
- mem_heap_t* heap) /*!< in: memory heap from which the memory for
- the built tuple is allocated */
+ const dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
{
dict_table_t* sys_tables;
dtuple_t* entry;
dfield_t* dfield;
byte* ptr;
- ut_ad(table && heap);
+ ut_ad(table);
+ ut_ad(heap);
sys_tables = dict_sys->sys_tables;
@@ -69,18 +71,18 @@ dict_create_sys_tables_tuple(
dict_table_copy_types(entry, sys_tables);
/* 0: NAME -----------------------------*/
- dfield = dtuple_get_nth_field(entry, 0);
+ dfield = dtuple_get_nth_field(entry, 0/*NAME*/);
dfield_set_data(dfield, table->name, ut_strlen(table->name));
/* 3: ID -------------------------------*/
- dfield = dtuple_get_nth_field(entry, 1);
+ dfield = dtuple_get_nth_field(entry, 1/*ID*/);
ptr = mem_heap_alloc(heap, 8);
mach_write_to_8(ptr, table->id);
dfield_set_data(dfield, ptr, 8);
/* 4: N_COLS ---------------------------*/
- dfield = dtuple_get_nth_field(entry, 2);
+ dfield = dtuple_get_nth_field(entry, 2/*N_COLS*/);
#if DICT_TF_COMPACT != 1
#error
@@ -91,40 +93,41 @@ dict_create_sys_tables_tuple(
| ((table->flags & DICT_TF_COMPACT) << 31));
dfield_set_data(dfield, ptr, 4);
/* 5: TYPE -----------------------------*/
- dfield = dtuple_get_nth_field(entry, 3);
+ dfield = dtuple_get_nth_field(entry, 3/*TYPE*/);
ptr = mem_heap_alloc(heap, 4);
- if (table->flags & ~DICT_TF_COMPACT) {
+ if (table->flags & (~DICT_TF_COMPACT & ~(~0 << DICT_TF_BITS))) {
ut_a(table->flags & DICT_TF_COMPACT);
ut_a(dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP);
ut_a(((ulonglong) table->flags & DICT_TF_ZSSIZE_MASK)
<= (ulonglong) (DICT_TF_ZSSIZE_MAX << DICT_TF_ZSSIZE_SHIFT));
- ut_a(!(table->flags & (~0 << DICT_TF_BITS)));
- mach_write_to_4(ptr, table->flags);
+ ut_a(!(table->flags & (~0 << DICT_TF2_BITS)));
+ mach_write_to_4(ptr, table->flags & ~(~0 << DICT_TF_BITS));
} else {
mach_write_to_4(ptr, DICT_TABLE_ORDINARY);
}
dfield_set_data(dfield, ptr, 4);
/* 6: MIX_ID (obsolete) ---------------------------*/
- dfield = dtuple_get_nth_field(entry, 4);
+ dfield = dtuple_get_nth_field(entry, 4/*MIX_ID*/);
ptr = mem_heap_zalloc(heap, 8);
dfield_set_data(dfield, ptr, 8);
- /* 7: MIX_LEN (obsolete) --------------------------*/
+ /* 7: MIX_LEN (additional flags) --------------------------*/
- dfield = dtuple_get_nth_field(entry, 5);
+ dfield = dtuple_get_nth_field(entry, 5/*MIX_LEN*/);
- ptr = mem_heap_zalloc(heap, 4);
+ ptr = mem_heap_alloc(heap, 4);
+ mach_write_to_4(ptr, table->flags >> DICT_TF2_SHIFT);
dfield_set_data(dfield, ptr, 4);
/* 8: CLUSTER_NAME ---------------------*/
- dfield = dtuple_get_nth_field(entry, 6);
+ dfield = dtuple_get_nth_field(entry, 6/*CLUSTER_NAME*/);
dfield_set_null(dfield); /* not supported */
/* 9: SPACE ----------------------------*/
- dfield = dtuple_get_nth_field(entry, 7);
+ dfield = dtuple_get_nth_field(entry, 7/*SPACE*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, table->space);
@@ -143,19 +146,21 @@ static
dtuple_t*
dict_create_sys_columns_tuple(
/*==========================*/
- dict_table_t* table, /*!< in: table */
- ulint i, /*!< in: column number */
- mem_heap_t* heap) /*!< in: memory heap from which the memory for
- the built tuple is allocated */
+ const dict_table_t* table, /*!< in: table */
+ ulint i, /*!< in: column number */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
{
dict_table_t* sys_columns;
dtuple_t* entry;
const dict_col_t* column;
dfield_t* dfield;
byte* ptr;
- const char* col_name;
+ const char* col_name;
- ut_ad(table && heap);
+ ut_ad(table);
+ ut_ad(heap);
column = dict_table_get_nth_col(table, i);
@@ -166,47 +171,47 @@ dict_create_sys_columns_tuple(
dict_table_copy_types(entry, sys_columns);
/* 0: TABLE_ID -----------------------*/
- dfield = dtuple_get_nth_field(entry, 0);
+ dfield = dtuple_get_nth_field(entry, 0/*TABLE_ID*/);
ptr = mem_heap_alloc(heap, 8);
mach_write_to_8(ptr, table->id);
dfield_set_data(dfield, ptr, 8);
/* 1: POS ----------------------------*/
- dfield = dtuple_get_nth_field(entry, 1);
+ dfield = dtuple_get_nth_field(entry, 1/*POS*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, i);
dfield_set_data(dfield, ptr, 4);
/* 4: NAME ---------------------------*/
- dfield = dtuple_get_nth_field(entry, 2);
+ dfield = dtuple_get_nth_field(entry, 2/*NAME*/);
col_name = dict_table_get_col_name(table, i);
dfield_set_data(dfield, col_name, ut_strlen(col_name));
/* 5: MTYPE --------------------------*/
- dfield = dtuple_get_nth_field(entry, 3);
+ dfield = dtuple_get_nth_field(entry, 3/*MTYPE*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, column->mtype);
dfield_set_data(dfield, ptr, 4);
/* 6: PRTYPE -------------------------*/
- dfield = dtuple_get_nth_field(entry, 4);
+ dfield = dtuple_get_nth_field(entry, 4/*PRTYPE*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, column->prtype);
dfield_set_data(dfield, ptr, 4);
/* 7: LEN ----------------------------*/
- dfield = dtuple_get_nth_field(entry, 5);
+ dfield = dtuple_get_nth_field(entry, 5/*LEN*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, column->len);
dfield_set_data(dfield, ptr, 4);
/* 8: PREC ---------------------------*/
- dfield = dtuple_get_nth_field(entry, 6);
+ dfield = dtuple_get_nth_field(entry, 6/*PREC*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, 0/* unused */);
@@ -230,6 +235,7 @@ dict_build_table_def_step(
dict_table_t* table;
dtuple_t* row;
ulint error;
+ ulint flags;
const char* path_or_name;
ibool is_path;
mtr_t mtr;
@@ -268,9 +274,10 @@ dict_build_table_def_step(
ut_ad(!dict_table_zip_size(table)
|| dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP);
+ flags = table->flags & ~(~0 << DICT_TF_BITS);
error = fil_create_new_single_table_tablespace(
&space, path_or_name, is_path,
- table->flags == DICT_TF_COMPACT ? 0 : table->flags,
+ flags == DICT_TF_COMPACT ? 0 : flags,
FIL_IBD_FILE_INITIAL_SIZE);
table->space = (unsigned int) space;
@@ -286,7 +293,7 @@ dict_build_table_def_step(
mtr_commit(&mtr);
} else {
/* Create in the system tablespace: disallow new features */
- table->flags &= DICT_TF_COMPACT;
+ table->flags &= (~0 << DICT_TF_BITS) | DICT_TF_COMPACT;
}
row = dict_create_sys_tables_tuple(table, node->heap);
@@ -322,9 +329,10 @@ static
dtuple_t*
dict_create_sys_indexes_tuple(
/*==========================*/
- dict_index_t* index, /*!< in: index */
- mem_heap_t* heap) /*!< in: memory heap from which the memory for
- the built tuple is allocated */
+ const dict_index_t* index, /*!< in: index */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
{
dict_table_t* sys_indexes;
dict_table_t* table;
@@ -333,7 +341,8 @@ dict_create_sys_indexes_tuple(
byte* ptr;
ut_ad(mutex_own(&(dict_sys->mutex)));
- ut_ad(index && heap);
+ ut_ad(index);
+ ut_ad(heap);
sys_indexes = dict_sys->sys_indexes;
@@ -344,32 +353,32 @@ dict_create_sys_indexes_tuple(
dict_table_copy_types(entry, sys_indexes);
/* 0: TABLE_ID -----------------------*/
- dfield = dtuple_get_nth_field(entry, 0);
+ dfield = dtuple_get_nth_field(entry, 0/*TABLE_ID*/);
ptr = mem_heap_alloc(heap, 8);
mach_write_to_8(ptr, table->id);
dfield_set_data(dfield, ptr, 8);
/* 1: ID ----------------------------*/
- dfield = dtuple_get_nth_field(entry, 1);
+ dfield = dtuple_get_nth_field(entry, 1/*ID*/);
ptr = mem_heap_alloc(heap, 8);
mach_write_to_8(ptr, index->id);
dfield_set_data(dfield, ptr, 8);
/* 4: NAME --------------------------*/
- dfield = dtuple_get_nth_field(entry, 2);
+ dfield = dtuple_get_nth_field(entry, 2/*NAME*/);
dfield_set_data(dfield, index->name, ut_strlen(index->name));
/* 5: N_FIELDS ----------------------*/
- dfield = dtuple_get_nth_field(entry, 3);
+ dfield = dtuple_get_nth_field(entry, 3/*N_FIELDS*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, index->n_fields);
dfield_set_data(dfield, ptr, 4);
/* 6: TYPE --------------------------*/
- dfield = dtuple_get_nth_field(entry, 4);
+ dfield = dtuple_get_nth_field(entry, 4/*TYPE*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, index->type);
@@ -381,7 +390,7 @@ dict_create_sys_indexes_tuple(
#error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 7"
#endif
- dfield = dtuple_get_nth_field(entry, 5);
+ dfield = dtuple_get_nth_field(entry, 5/*SPACE*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, index->space);
@@ -393,7 +402,7 @@ dict_create_sys_indexes_tuple(
#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 8"
#endif
- dfield = dtuple_get_nth_field(entry, 6);
+ dfield = dtuple_get_nth_field(entry, 6/*PAGE_NO*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, FIL_NULL);
@@ -412,10 +421,11 @@ static
dtuple_t*
dict_create_sys_fields_tuple(
/*=========================*/
- dict_index_t* index, /*!< in: index */
- ulint i, /*!< in: field number */
- mem_heap_t* heap) /*!< in: memory heap from which the memory for
- the built tuple is allocated */
+ const dict_index_t* index, /*!< in: index */
+ ulint i, /*!< in: field number */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
{
dict_table_t* sys_fields;
dtuple_t* entry;
@@ -425,7 +435,8 @@ dict_create_sys_fields_tuple(
ibool index_contains_column_prefix_field = FALSE;
ulint j;
- ut_ad(index && heap);
+ ut_ad(index);
+ ut_ad(heap);
for (j = 0; j < index->n_fields; j++) {
if (dict_index_get_nth_field(index, j)->prefix_len > 0) {
@@ -443,7 +454,7 @@ dict_create_sys_fields_tuple(
dict_table_copy_types(entry, sys_fields);
/* 0: INDEX_ID -----------------------*/
- dfield = dtuple_get_nth_field(entry, 0);
+ dfield = dtuple_get_nth_field(entry, 0/*INDEX_ID*/);
ptr = mem_heap_alloc(heap, 8);
mach_write_to_8(ptr, index->id);
@@ -451,7 +462,7 @@ dict_create_sys_fields_tuple(
dfield_set_data(dfield, ptr, 8);
/* 1: POS + PREFIX LENGTH ----------------------------*/
- dfield = dtuple_get_nth_field(entry, 1);
+ dfield = dtuple_get_nth_field(entry, 1/*POS*/);
ptr = mem_heap_alloc(heap, 4);
@@ -471,7 +482,7 @@ dict_create_sys_fields_tuple(
dfield_set_data(dfield, ptr, 4);
/* 4: COL_NAME -------------------------*/
- dfield = dtuple_get_nth_field(entry, 2);
+ dfield = dtuple_get_nth_field(entry, 2/*COL_NAME*/);
dfield_set_data(dfield, field->name,
ut_strlen(field->name));
@@ -602,6 +613,7 @@ dict_create_index_tree_step(
dict_table_t* sys_indexes;
dict_table_t* table;
dtuple_t* search_tuple;
+ ulint zip_size;
btr_pcur_t pcur;
mtr_t mtr;
@@ -626,8 +638,9 @@ dict_create_index_tree_step(
btr_pcur_move_to_next_user_rec(&pcur, &mtr);
- node->page_no = btr_create(index->type, index->space,
- dict_table_zip_size(index->table),
+ zip_size = dict_table_zip_size(index->table);
+
+ node->page_no = btr_create(index->type, index->space, zip_size,
index->id, index, &mtr);
/* printf("Created a new index tree in space %lu root page %lu\n",
index->space, index->page_no); */
@@ -1092,8 +1105,11 @@ dict_create_index_step(
dulint index_id = node->index->id;
- err = dict_index_add_to_cache(node->table, node->index,
- FIL_NULL, TRUE);
+ err = dict_index_add_to_cache(
+ node->table, node->index, FIL_NULL,
+ trx_is_strict(trx)
+ || dict_table_get_format(node->table)
+ >= DICT_TF_FORMAT_ZIP);
node->index = dict_index_get_if_in_cache_low(index_id);
ut_a(!node->index == (err != DB_SUCCESS));
diff --git a/storage/xtradb/dict/dict0dict.c b/storage/xtradb/dict/dict0dict.c
index 7b4174b4b15..1d088b2e02b 100644
--- a/storage/xtradb/dict/dict0dict.c
+++ b/storage/xtradb/dict/dict0dict.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -81,6 +81,10 @@ UNIV_INTERN rw_lock_t dict_operation_lock;
/** Identifies generated InnoDB foreign key names */
static char dict_ibfk[] = "_ibfk_";
+/** array of mutexes protecting dict_index_t::stat_n_diff_key_vals[] */
+#define DICT_INDEX_STAT_MUTEX_SIZE 32
+mutex_t dict_index_stat_mutex[DICT_INDEX_STAT_MUTEX_SIZE];
+
/*******************************************************************//**
Tries to find column names for the index and sets the col field of the
index.
@@ -141,7 +145,7 @@ static
void
dict_field_print_low(
/*=================*/
- dict_field_t* field); /*!< in: field */
+ const dict_field_t* field); /*!< in: field */
/*********************************************************************//**
Frees a foreign key struct. */
static
@@ -240,6 +244,45 @@ dict_mutex_exit_for_mysql(void)
mutex_exit(&(dict_sys->mutex));
}
+/** Get the mutex that protects index->stat_n_diff_key_vals[] */
+#define GET_INDEX_STAT_MUTEX(index) \
+ (&dict_index_stat_mutex[ut_fold_dulint(index->id) \
+ % DICT_INDEX_STAT_MUTEX_SIZE])
+
+/**********************************************************************//**
+Lock the appropriate mutex to protect index->stat_n_diff_key_vals[].
+index->id is used to pick the right mutex and it should not change
+before dict_index_stat_mutex_exit() is called on this index. */
+UNIV_INTERN
+void
+dict_index_stat_mutex_enter(
+/*========================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index != NULL);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+ ut_ad(!index->to_be_dropped);
+
+ mutex_enter(GET_INDEX_STAT_MUTEX(index));
+}
+
+/**********************************************************************//**
+Unlock the appropriate mutex that protects index->stat_n_diff_key_vals[]. */
+UNIV_INTERN
+void
+dict_index_stat_mutex_exit(
+/*=======================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index != NULL);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+ ut_ad(!index->to_be_dropped);
+
+ mutex_exit(GET_INDEX_STAT_MUTEX(index));
+}
+
/********************************************************************//**
Decrements the count of open MySQL handles to a table. */
UNIV_INTERN
@@ -608,6 +651,8 @@ void
dict_init(void)
/*===========*/
{
+ int i;
+
dict_sys = mem_alloc(sizeof(dict_sys_t));
mutex_create(&dict_sys->mutex, SYNC_DICT);
@@ -628,6 +673,10 @@ dict_init(void)
ut_a(dict_foreign_err_file);
mutex_create(&dict_foreign_err_mutex, SYNC_ANY_LATCH);
+
+ for (i = 0; i < DICT_INDEX_STAT_MUTEX_SIZE; i++) {
+ mutex_create(&dict_index_stat_mutex[i], SYNC_INDEX_TREE);
+ }
}
/**********************************************************************//**
@@ -1442,11 +1491,7 @@ dict_index_too_big_for_tree(
goto add_field_size;
}
- if (srv_relax_table_creation) {
- field_max_size = dict_col_get_min_size(col);
- } else {
field_max_size = dict_col_get_max_size(col);
- }
field_ext_max_size = field_max_size < 256 ? 1 : 2;
if (field->prefix_len) {
@@ -1527,6 +1572,7 @@ dict_index_add_to_cache(
if (!dict_index_find_cols(table, index)) {
+ dict_mem_index_free(index);
return(DB_CORRUPTION);
}
@@ -4247,9 +4293,13 @@ dict_update_statistics_low(
index = dict_table_get_first_index(table);
+ dict_index_stat_mutex_enter(index);
+
table->stat_n_rows = index->stat_n_diff_key_vals[
dict_index_get_n_unique(index)];
+ dict_index_stat_mutex_exit(index);
+
table->stat_clustered_index_size = index->stat_index_size;
table->stat_sum_of_other_index_sizes = sum_of_index_sizes
@@ -4428,6 +4478,8 @@ dict_index_print_low(
ut_ad(mutex_own(&(dict_sys->mutex)));
+ dict_index_stat_mutex_enter(index);
+
if (index->n_user_defined_cols > 0) {
n_vals = index->stat_n_diff_key_vals[
index->n_user_defined_cols];
@@ -4435,6 +4487,8 @@ dict_index_print_low(
n_vals = index->stat_n_diff_key_vals[1];
}
+ dict_index_stat_mutex_exit(index);
+
if (dict_index_is_clust(index)) {
type_string = "clustered index";
} else if (dict_index_is_unique(index)) {
@@ -4480,7 +4534,7 @@ static
void
dict_field_print_low(
/*=================*/
- dict_field_t* field) /*!< in: field */
+ const dict_field_t* field) /*!< in: field */
{
ut_ad(mutex_own(&(dict_sys->mutex)));
@@ -4844,8 +4898,10 @@ UNIV_INTERN
void
dict_table_check_for_dup_indexes(
/*=============================*/
- const dict_table_t* table) /*!< in: Check for dup indexes
+ const dict_table_t* table, /*!< in: Check for dup indexes
in this table */
+ ibool tmp_ok) /*!< in: TRUE=allow temporary
+ index names */
{
/* Check for duplicates, ignoring indexes that are marked
as to be dropped */
@@ -4853,13 +4909,17 @@ dict_table_check_for_dup_indexes(
const dict_index_t* index1;
const dict_index_t* index2;
+ ut_ad(mutex_own(&dict_sys->mutex));
+
/* The primary index _must_ exist */
ut_a(UT_LIST_GET_LEN(table->indexes) > 0);
index1 = UT_LIST_GET_FIRST(table->indexes);
- index2 = UT_LIST_GET_NEXT(indexes, index1);
- while (index1 && index2) {
+ do {
+ ut_ad(tmp_ok || *index1->name != TEMP_INDEX_PREFIX);
+
+ index2 = UT_LIST_GET_NEXT(indexes, index1);
while (index2) {
@@ -4871,8 +4931,7 @@ dict_table_check_for_dup_indexes(
}
index1 = UT_LIST_GET_NEXT(indexes, index1);
- index2 = UT_LIST_GET_NEXT(indexes, index1);
- }
+ } while (index1);
}
#endif /* UNIV_DEBUG */
@@ -4925,6 +4984,10 @@ dict_close(void)
mem_free(dict_sys);
dict_sys = NULL;
+
+ for (i = 0; i < DICT_INDEX_STAT_MUTEX_SIZE; i++) {
+ mutex_free(&dict_index_stat_mutex[i]);
+ }
}
/*************************************************************************
@@ -4939,7 +5002,7 @@ dict_table_set_corrupt_by_space(
dict_table_t* table;
ibool found = FALSE;
- ut_a(space_id != 0 && space_id < SRV_LOG_SPACE_FIRST_ID);
+ ut_a(!trx_sys_sys_space(space_id) && space_id < SRV_LOG_SPACE_FIRST_ID);
if (need_mutex)
mutex_enter(&(dict_sys->mutex));
diff --git a/storage/xtradb/dict/dict0load.c b/storage/xtradb/dict/dict0load.c
index 46cce5050cd..ba0374595ce 100644
--- a/storage/xtradb/dict/dict0load.c
+++ b/storage/xtradb/dict/dict0load.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -260,7 +260,7 @@ dict_sys_tables_get_flags(
return(0);
}
- field = rec_get_nth_field_old(rec, 4, &len);
+ field = rec_get_nth_field_old(rec, 4/*N_COLS*/, &len);
n_cols = mach_read_from_4(field);
if (UNIV_UNLIKELY(!(n_cols & 0x80000000UL))) {
@@ -390,15 +390,35 @@ loop:
mtr_commit(&mtr);
- if (space_id != 0 && in_crash_recovery) {
+ if (trx_sys_sys_space(space_id)) {
+ /* The system tablespace always exists. */
+ } else if (in_crash_recovery) {
/* Check that the tablespace (the .ibd file) really
- exists; print a warning to the .err log if not */
-
- fil_space_for_table_exists_in_mem(space_id, name,
- FALSE, TRUE, TRUE);
- }
+ exists; print a warning to the .err log if not.
+ Do not print warnings for temporary tables. */
+ ibool is_temp;
+
+ field = rec_get_nth_field_old(rec, 4, &len);
+ if (0x80000000UL & mach_read_from_4(field)) {
+ /* ROW_FORMAT=COMPACT: read the is_temp
+ flag from SYS_TABLES.MIX_LEN. */
+ field = rec_get_nth_field_old(rec, 7, &len);
+ is_temp = mach_read_from_4(field)
+ & DICT_TF2_TEMPORARY;
+ } else {
+ /* For tables created with old versions
+ of InnoDB, SYS_TABLES.MIX_LEN may contain
+ garbage. Such tables would always be
+ in ROW_FORMAT=REDUNDANT. Pretend that
+ all such tables are non-temporary. That is,
+ do not suppress error printouts about
+ temporary tables not being found. */
+ is_temp = FALSE;
+ }
- if (space_id != 0 && !in_crash_recovery) {
+ fil_space_for_table_exists_in_mem(
+ space_id, name, is_temp, TRUE, !is_temp);
+ } else {
/* It is a normal database startup: create the space
object and check that the .ibd file exists. */
@@ -878,7 +898,7 @@ err_exit:
space = mach_read_from_4(field);
/* Check if the tablespace exists and has the right name */
- if (space != 0) {
+ if (!trx_sys_sys_space(space)) {
flags = dict_sys_tables_get_flags(rec);
if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) {
@@ -894,43 +914,72 @@ err_exit:
(ulong) flags);
goto err_exit;
}
+ } else {
+ flags = 0;
+ }
- if (fil_space_for_table_exists_in_mem(space, name, FALSE,
- FALSE, FALSE)) {
- /* Ok; (if we did a crash recovery then the tablespace
- can already be in the memory cache) */
- } else {
- /* In >= 4.1.9, InnoDB scans the data dictionary also
- at a normal mysqld startup. It is an error if the
- space object does not exist in memory. */
+ ut_a(name_of_col_is(sys_tables, sys_index, 4, "N_COLS"));
+ field = rec_get_nth_field_old(rec, 4, &len);
+ n_cols = mach_read_from_4(field);
+
+ /* The high-order bit of N_COLS is the "compact format" flag.
+ For tables in that format, MIX_LEN may hold additional flags. */
+ if (n_cols & 0x80000000UL) {
+ ulint flags2;
+
+ flags |= DICT_TF_COMPACT;
+
+ ut_a(name_of_col_is(sys_tables, sys_index, 7, "MIX_LEN"));
+ field = rec_get_nth_field_old(rec, 7, &len);
+
+ flags2 = mach_read_from_4(field);
+
+ if (flags2 & (~0 << (DICT_TF2_BITS - DICT_TF2_SHIFT))) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: table ", stderr);
+ ut_print_filename(stderr, name);
+ fprintf(stderr, "\n"
+ "InnoDB: in InnoDB data dictionary"
+ " has unknown flags %lx.\n",
+ (ulong) flags2);
+
+ flags2 &= ~(~0 << (DICT_TF2_BITS - DICT_TF2_SHIFT));
+ }
+
+ flags |= flags2 << DICT_TF2_SHIFT;
+ }
+
+ /* See if the tablespace is available. */
+ if (trx_sys_sys_space(space)) {
+ /* The system tablespace is always available. */
+ } else if (!fil_space_for_table_exists_in_mem(
+ space, name,
+ (flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY,
+ FALSE, FALSE)) {
+
+ if ((flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY) {
+ /* Do not bother to retry opening temporary tables. */
+ ibd_file_missing = TRUE;
+ } else {
ut_print_timestamp(stderr);
fprintf(stderr,
- " InnoDB: error: space object of table %s,\n"
+ " InnoDB: error: space object of table");
+ ut_print_filename(stderr, name);
+ fprintf(stderr, ",\n"
"InnoDB: space id %lu did not exist in memory."
" Retrying an open.\n",
- name, (ulong)space);
+ (ulong) space);
/* Try to open the tablespace */
if (!fil_open_single_table_tablespace(
- TRUE, space, flags, name)) {
- /* We failed to find a sensible tablespace
- file */
+ TRUE, space,
+ flags & ~(~0 << DICT_TF_BITS), name)) {
+ /* We failed to find a sensible
+ tablespace file */
ibd_file_missing = TRUE;
}
}
- } else {
- flags = 0;
- }
-
- ut_a(name_of_col_is(sys_tables, sys_index, 4, "N_COLS"));
-
- field = rec_get_nth_field_old(rec, 4, &len);
- n_cols = mach_read_from_4(field);
-
- /* The high-order bit of N_COLS is the "compact format" flag. */
- if (n_cols & 0x80000000UL) {
- flags |= DICT_TF_COMPACT;
}
table = dict_mem_table_create(name, space, n_cols & ~0x80000000UL,
diff --git a/storage/xtradb/dict/dict0mem.c b/storage/xtradb/dict/dict0mem.c
index acf550befad..388d46568ac 100644
--- a/storage/xtradb/dict/dict0mem.c
+++ b/storage/xtradb/dict/dict0mem.c
@@ -59,7 +59,7 @@ dict_mem_table_create(
mem_heap_t* heap;
ut_ad(name);
- ut_a(!(flags & (~0 << DICT_TF_BITS)));
+ ut_a(!(flags & (~0 << DICT_TF2_BITS)));
heap = mem_heap_create(DICT_HEAP_SIZE);
diff --git a/storage/xtradb/fil/fil0fil.c b/storage/xtradb/fil/fil0fil.c
index 69316d62c53..bb1eed2a1e3 100644
--- a/storage/xtradb/fil/fil0fil.c
+++ b/storage/xtradb/fil/fil0fil.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -38,6 +38,7 @@ Created 10/25/1995 Heikki Tuuri
#include "mtr0mtr.h"
#include "mtr0log.h"
#include "dict0dict.h"
+#include "page0page.h"
#include "page0zip.h"
#include "trx0trx.h"
#include "trx0sys.h"
@@ -675,14 +676,14 @@ fil_node_open_file(
size_bytes = (((ib_uint64_t)size_high) << 32)
+ (ib_uint64_t)size_low;
#ifdef UNIV_HOTBACKUP
- if (space->id == 0) {
+ if (trx_sys_sys_space(space->id)) {
node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
os_file_close(node->handle);
goto add_size;
}
#endif /* UNIV_HOTBACKUP */
ut_a(space->purpose != FIL_LOG);
- ut_a(space->id != 0);
+ ut_a(!trx_sys_sys_space(space->id));
if (size_bytes < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) {
fprintf(stderr,
@@ -728,7 +729,7 @@ fil_node_open_file(
}
if (UNIV_UNLIKELY(space_id == ULINT_UNDEFINED
- || space_id == 0)) {
+ || trx_sys_sys_space(space_id))) {
fprintf(stderr,
"InnoDB: Error: tablespace id %lu"
" in file %s is not sensible\n",
@@ -790,7 +791,7 @@ add_size:
system->n_open++;
- if (space->purpose == FIL_TABLESPACE && space->id != 0) {
+ if (space->purpose == FIL_TABLESPACE && !trx_sys_sys_space(space->id)) {
/* Put the node to the LRU list */
UT_LIST_ADD_FIRST(LRU, system->LRU, node);
}
@@ -823,7 +824,7 @@ fil_node_close_file(
ut_a(system->n_open > 0);
system->n_open--;
- if (node->space->purpose == FIL_TABLESPACE && node->space->id != 0) {
+ if (node->space->purpose == FIL_TABLESPACE && !trx_sys_sys_space(node->space->id)) {
ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
/* The node is in the LRU list, remove it */
@@ -909,7 +910,7 @@ fil_mutex_enter_and_prepare_for_io(
retry:
mutex_enter(&fil_system->mutex);
- if (space_id == 0 || space_id >= SRV_LOG_SPACE_FIRST_ID) {
+ if (trx_sys_sys_space(space_id) || space_id >= SRV_LOG_SPACE_FIRST_ID) {
/* We keep log files and system tablespace files always open;
this is important in preventing deadlocks in this module, as
a page read completion often performs another read from the
@@ -1104,10 +1105,13 @@ fil_space_create(
fil_space_t* space;
/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
- ROW_FORMAT=COMPACT (table->flags == DICT_TF_COMPACT) and
+ ROW_FORMAT=COMPACT
+ ((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
ROW_FORMAT=REDUNDANT (table->flags == 0). For any other
- format, the tablespace flags should equal table->flags. */
+ format, the tablespace flags should equal
+ (table->flags & ~(~0 << DICT_TF_BITS)). */
ut_a(flags != DICT_TF_COMPACT);
+ ut_a(!(flags & (~0UL << DICT_TF_BITS)));
try_again:
/*printf(
@@ -1136,7 +1140,7 @@ try_again:
" tablespace memory cache!\n",
(ulong) space->id);
- if (id == 0 || purpose != FIL_TABLESPACE) {
+ if (trx_sys_sys_space(id) || purpose != FIL_TABLESPACE) {
mutex_exit(&fil_system->mutex);
@@ -1530,7 +1534,7 @@ fil_init(
fil_system->max_n_open = max_n_open;
fil_system->modification_counter = 0;
- fil_system->max_assigned_id = 0;
+ fil_system->max_assigned_id = TRX_SYS_SPACE_MAX;
fil_system->tablespace_version = 0;
@@ -1557,7 +1561,7 @@ fil_open_log_and_system_tablespace_files(void)
space = UT_LIST_GET_FIRST(fil_system->space_list);
while (space != NULL) {
- if (space->purpose != FIL_TABLESPACE || space->id == 0) {
+ if (space->purpose != FIL_TABLESPACE || trx_sys_sys_space(space->id)) {
node = UT_LIST_GET_FIRST(space->chain);
while (node != NULL) {
@@ -2591,10 +2595,13 @@ fil_create_new_single_table_tablespace(
ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE);
/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
- ROW_FORMAT=COMPACT (table->flags == DICT_TF_COMPACT) and
+ ROW_FORMAT=COMPACT
+ ((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
ROW_FORMAT=REDUNDANT (table->flags == 0). For any other
- format, the tablespace flags should equal table->flags. */
+ format, the tablespace flags should equal
+ (table->flags & ~(~0 << DICT_TF_BITS)). */
ut_a(flags != DICT_TF_COMPACT);
+ ut_a(!(flags & (~0UL << DICT_TF_BITS)));
path = fil_make_ibd_name(tablename, is_temp);
@@ -2795,6 +2802,7 @@ fil_reset_too_high_lsns(
ib_int64_t offset;
ulint zip_size;
ibool success;
+ page_zip_des_t page_zip;
filepath = fil_make_ibd_name(name, FALSE);
@@ -2842,6 +2850,12 @@ fil_reset_too_high_lsns(
space_id = fsp_header_get_space_id(page);
zip_size = fsp_header_get_zip_size(page);
+ page_zip_des_init(&page_zip);
+ page_zip_set_size(&page_zip, zip_size);
+ if (zip_size) {
+ page_zip.data = page + UNIV_PAGE_SIZE;
+ }
+
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Flush lsn in the tablespace file %lu"
@@ -2876,20 +2890,23 @@ fil_reset_too_high_lsns(
/* We have to reset the lsn */
if (zip_size) {
- memcpy(page + UNIV_PAGE_SIZE, page, zip_size);
+ memcpy(page_zip.data, page, zip_size);
buf_flush_init_for_writing(
- page, page + UNIV_PAGE_SIZE,
- current_lsn);
+ page, &page_zip, current_lsn);
+ success = os_file_write(
+ filepath, file, page_zip.data,
+ (ulint) offset & 0xFFFFFFFFUL,
+ (ulint) (offset >> 32), zip_size);
} else {
buf_flush_init_for_writing(
page, NULL, current_lsn);
+ success = os_file_write(
+ filepath, file, page,
+ (ulint)(offset & 0xFFFFFFFFUL),
+ (ulint)(offset >> 32),
+ UNIV_PAGE_SIZE);
}
- success = os_file_write(filepath, file, page,
- (ulint)(offset & 0xFFFFFFFFUL),
- (ulint)(offset >> 32),
- zip_size
- ? zip_size
- : UNIV_PAGE_SIZE);
+
if (!success) {
goto func_exit;
@@ -2965,10 +2982,13 @@ fil_open_single_table_tablespace(
filepath = fil_make_ibd_name(name, FALSE);
/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
- ROW_FORMAT=COMPACT (table->flags == DICT_TF_COMPACT) and
+ ROW_FORMAT=COMPACT
+ ((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
ROW_FORMAT=REDUNDANT (table->flags == 0). For any other
- format, the tablespace flags should equal table->flags. */
+ format, the tablespace flags should equal
+ (table->flags & ~(~0 << DICT_TF_BITS)). */
ut_a(flags != DICT_TF_COMPACT);
+ ut_a(!(flags & (~0UL << DICT_TF_BITS)));
file = os_file_create_simple_no_error_handling(
filepath, OS_FILE_OPEN, OS_FILE_READ_WRITE, &success);
@@ -3018,7 +3038,8 @@ fil_open_single_table_tablespace(
space_id = fsp_header_get_space_id(page);
space_flags = fsp_header_get_flags(page);
- if (srv_expand_import && (space_id != id || space_flags != flags)) {
+ if (srv_expand_import
+ && (space_id != id || space_flags != (flags & ~(~0 << DICT_TF_BITS)))) {
dulint old_id[31];
dulint new_id[31];
ulint root_page[31];
@@ -3359,7 +3380,8 @@ skip_write:
ut_free(buf2);
- if (UNIV_UNLIKELY(space_id != id || space_flags != flags)) {
+ if (UNIV_UNLIKELY(space_id != id
+ || space_flags != (flags & ~(~0 << DICT_TF_BITS)))) {
ut_print_timestamp(stderr);
fputs(" InnoDB: Error: tablespace id and flags in file ",
@@ -3598,7 +3620,7 @@ fil_load_single_table_tablespace(
}
#ifndef UNIV_HOTBACKUP
- if (space_id == ULINT_UNDEFINED || space_id == 0) {
+ if (space_id == ULINT_UNDEFINED || trx_sys_sys_space(space_id)) {
fprintf(stderr,
"InnoDB: Error: tablespace id %lu in file %s"
" is not sensible\n",
@@ -3607,7 +3629,7 @@ fil_load_single_table_tablespace(
goto func_exit;
}
#else
- if (space_id == ULINT_UNDEFINED || space_id == 0) {
+ if (space_id == ULINT_UNDEFINED || trx_sys_sys_space(space_id)) {
char* new_path;
fprintf(stderr,
@@ -3886,7 +3908,7 @@ fil_print_orphaned_tablespaces(void)
space = UT_LIST_GET_FIRST(fil_system->space_list);
while (space) {
- if (space->purpose == FIL_TABLESPACE && space->id != 0
+ if (space->purpose == FIL_TABLESPACE && !trx_sys_sys_space(space->id)
&& !space->mark) {
fputs("InnoDB: Warning: tablespace ", stderr);
ut_print_filename(stderr, space->name);
@@ -4461,7 +4483,7 @@ fil_node_prepare_for_io(
}
if (node->n_pending == 0 && space->purpose == FIL_TABLESPACE
- && space->id != 0) {
+ && !trx_sys_sys_space(space->id)) {
/* The node is in the LRU list, remove it */
ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
@@ -4507,7 +4529,7 @@ fil_node_complete_io(
}
if (node->n_pending == 0 && node->space->purpose == FIL_TABLESPACE
- && node->space->id != 0) {
+ && !trx_sys_sys_space(node->space->id)) {
/* The node must be put back to the LRU list */
UT_LIST_ADD_FIRST(LRU, system->LRU, node);
}
@@ -5141,7 +5163,7 @@ fil_validate(void)
ut_a(fil_node->n_pending == 0);
ut_a(fil_node->open);
ut_a(fil_node->space->purpose == FIL_TABLESPACE);
- ut_a(fil_node->space->id != 0);
+ ut_a(!trx_sys_sys_space(fil_node->space->id));
fil_node = UT_LIST_GET_NEXT(LRU, fil_node);
}
@@ -5223,8 +5245,10 @@ void
fil_close(void)
/*===========*/
{
+#ifndef UNIV_HOTBACKUP
/* The mutex should already have been freed. */
ut_ad(fil_system->mutex.magic_n == 0);
+#endif /* !UNIV_HOTBACKUP */
hash_table_free(fil_system->spaces);
diff --git a/storage/xtradb/fsp/fsp0fsp.c b/storage/xtradb/fsp/fsp0fsp.c
index 19722623611..cd28186109f 100644
--- a/storage/xtradb/fsp/fsp0fsp.c
+++ b/storage/xtradb/fsp/fsp0fsp.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -48,7 +48,7 @@ Created 11/29/1995 Heikki Tuuri
# include "log0log.h"
#endif /* UNIV_HOTBACKUP */
#include "dict0mem.h"
-
+#include "trx0sys.h"
#define FSP_HEADER_OFFSET FIL_PAGE_DATA /* Offset of the space header
within a file page */
@@ -392,11 +392,11 @@ UNIV_INLINE
ibool
xdes_get_bit(
/*=========*/
- xdes_t* descr, /*!< in: descriptor */
- ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
- ulint offset, /*!< in: page offset within extent:
- 0 ... FSP_EXTENT_SIZE - 1 */
- mtr_t* mtr) /*!< in: mtr */
+ const xdes_t* descr, /*!< in: descriptor */
+ ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+ ulint offset, /*!< in: page offset within extent:
+ 0 ... FSP_EXTENT_SIZE - 1 */
+ mtr_t* mtr) /*!< in: mtr */
{
ulint index;
ulint byte_index;
@@ -533,8 +533,8 @@ UNIV_INLINE
ulint
xdes_get_n_used(
/*============*/
- xdes_t* descr, /*!< in: descriptor */
- mtr_t* mtr) /*!< in: mtr */
+ const xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in: mtr */
{
ulint i;
ulint count = 0;
@@ -557,8 +557,8 @@ UNIV_INLINE
ibool
xdes_is_free(
/*=========*/
- xdes_t* descr, /*!< in: descriptor */
- mtr_t* mtr) /*!< in: mtr */
+ const xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in: mtr */
{
if (0 == xdes_get_n_used(descr, mtr)) {
@@ -575,8 +575,8 @@ UNIV_INLINE
ibool
xdes_is_full(
/*=========*/
- xdes_t* descr, /*!< in: descriptor */
- mtr_t* mtr) /*!< in: mtr */
+ const xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in: mtr */
{
if (FSP_EXTENT_SIZE == xdes_get_n_used(descr, mtr)) {
@@ -592,7 +592,7 @@ UNIV_INLINE
void
xdes_set_state(
/*===========*/
- xdes_t* descr, /*!< in: descriptor */
+ xdes_t* descr, /*!< in/out: descriptor */
ulint state, /*!< in: state to set */
mtr_t* mtr) /*!< in: mtr handle */
{
@@ -611,8 +611,8 @@ UNIV_INLINE
ulint
xdes_get_state(
/*===========*/
- xdes_t* descr, /*!< in: descriptor */
- mtr_t* mtr) /*!< in: mtr handle */
+ const xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in: mtr handle */
{
ulint state;
@@ -708,7 +708,7 @@ UNIV_INLINE
xdes_t*
xdes_get_descriptor_with_space_hdr(
/*===============================*/
- fsp_header_t* sp_header,/*!< in: space header, x-latched */
+ fsp_header_t* sp_header,/*!< in/out: space header, x-latched */
ulint space, /*!< in: space id */
ulint offset, /*!< in: page offset;
if equal to the free limit,
@@ -878,14 +878,10 @@ fsp_init_file_page_low(
return;
}
-#ifdef UNIV_BASIC_LOG_DEBUG
- memset(page, 0xff, UNIV_PAGE_SIZE);
-#endif
+ memset(page, 0, UNIV_PAGE_SIZE);
mach_write_to_4(page + FIL_PAGE_OFFSET, buf_block_get_page_no(block));
- memset(page + FIL_PAGE_LSN, 0, 8);
mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
buf_block_get_space(block));
- memset(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, 0, 8);
}
#ifndef UNIV_HOTBACKUP
@@ -1013,10 +1009,10 @@ fsp_header_init(
flst_init(header + FSP_SEG_INODES_FREE, mtr);
mlog_write_dulint(header + FSP_SEG_ID, ut_dulint_create(0, 1), mtr);
- if (space == 0) {
+ if (space == TRX_SYS_SPACE || space == TRX_DOUBLEWRITE_SPACE) {
fsp_fill_free_list(FALSE, space, header, mtr);
btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF,
- 0, 0, ut_dulint_add(DICT_IBUF_ID_MIN, space),
+ space, 0, ut_dulint_add(DICT_IBUF_ID_MIN, space),
dict_ind_redundant, mtr);
} else {
fsp_fill_free_list(TRUE, space, header, mtr);
@@ -1351,7 +1347,7 @@ fsp_fill_free_list(
descriptor page and ibuf bitmap page;
then we do not allocate more extents */
ulint space, /*!< in: space */
- fsp_header_t* header, /*!< in: space header */
+ fsp_header_t* header, /*!< in/out: space header */
mtr_t* mtr) /*!< in: mtr */
{
ulint limit;
diff --git a/storage/xtradb/ha/ha0ha.c b/storage/xtradb/ha/ha0ha.c
index cb5e541b55d..9d9d341ad39 100644
--- a/storage/xtradb/ha/ha0ha.c
+++ b/storage/xtradb/ha/ha0ha.c
@@ -101,6 +101,8 @@ ha_clear(
ulint i;
ulint n;
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
#ifdef UNIV_SYNC_DEBUG
ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EXCLUSIVE));
#endif /* UNIV_SYNC_DEBUG */
@@ -146,7 +148,9 @@ ha_insert_for_fold_func(
ha_node_t* prev_node;
ulint hash;
- ut_ad(table && data);
+ ut_ad(data);
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
ut_a(block->frame == page_align(data));
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
@@ -237,6 +241,8 @@ ha_delete_hash_node(
hash_table_t* table, /*!< in: hash table */
ha_node_t* del_node) /*!< in: node to be deleted */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
# ifndef UNIV_HOTBACKUP
if (table->adaptive) {
@@ -267,6 +273,8 @@ ha_search_and_update_if_found_func(
{
ha_node_t* node;
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
ASSERT_HASH_MUTEX_OWN(table, fold);
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
ut_a(new_block->frame == page_align(new_data));
@@ -304,6 +312,8 @@ ha_remove_all_nodes_to_page(
{
ha_node_t* node;
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
ASSERT_HASH_MUTEX_OWN(table, fold);
node = ha_chain_get_first(table, fold);
@@ -353,6 +363,8 @@ ha_validate(
ibool ok = TRUE;
ulint i;
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
ut_a(start_index <= end_index);
ut_a(start_index < hash_get_n_cells(table));
ut_a(end_index < hash_get_n_cells(table));
@@ -404,6 +416,8 @@ builds, see http://bugs.mysql.com/36941 */
#endif /* PRINT_USED_CELLS */
ulint n_bufs;
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
#ifdef PRINT_USED_CELLS
for (i = 0; i < hash_get_n_cells(table); i++) {
diff --git a/storage/xtradb/ha/hash0hash.c b/storage/xtradb/ha/hash0hash.c
index 2800d7793f8..30c304dafcd 100644
--- a/storage/xtradb/ha/hash0hash.c
+++ b/storage/xtradb/ha/hash0hash.c
@@ -119,7 +119,7 @@ hash_create(
table->heaps = NULL;
#endif /* !UNIV_HOTBACKUP */
table->heap = NULL;
- table->magic_n = HASH_TABLE_MAGIC_N;
+ ut_d(table->magic_n = HASH_TABLE_MAGIC_N);
/* Initialize the cell array */
hash_table_clear(table);
@@ -135,6 +135,8 @@ hash_table_free(
/*============*/
hash_table_t* table) /*!< in, own: hash table */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
#ifndef UNIV_HOTBACKUP
ut_a(table->mutexes == NULL);
#endif /* !UNIV_HOTBACKUP */
@@ -160,6 +162,8 @@ hash_create_mutexes_func(
{
ulint i;
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
ut_a(n_mutexes > 0);
ut_a(ut_is_2pow(n_mutexes));
diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc
index 5a5e04bc9d7..56688137cd6 100644
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -1,7 +1,8 @@
/*****************************************************************************
-Copyright (c) 2000, 2009, MySQL AB & Innobase Oy. All Rights Reserved.
+Copyright (c) 2000, 2010, MySQL AB & Innobase Oy. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
+Copyright (c) 2009, Percona Inc.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -9,6 +10,13 @@ briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
@@ -22,32 +30,6 @@ this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA
*****************************************************************************/
-/***********************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-Copyright (c) 2009, Percona Inc.
-
-Portions of this file contain modifications contributed and copyrighted
-by Percona Inc.. Those modifications are
-gratefully acknowledged and are described briefly in the InnoDB
-documentation. The contributions by Percona Inc. are incorporated with
-their permission, and subject to the conditions contained in the file
-COPYING.Percona.
-
-This program is free software; you can redistribute it and/or modify it
-under the terms of the GNU General Public License as published by the
-Free Software Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
-Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-***********************************************************************/
/* TODO list for the InnoDB handler in 5.0:
- Remove the flag trx->active_trans and look at trx->conc_state
@@ -141,7 +123,6 @@ static ulong commit_threads = 0;
static pthread_mutex_t commit_threads_m;
static pthread_cond_t commit_cond;
static pthread_mutex_t commit_cond_m;
-static pthread_mutex_t analyze_mutex;
static bool innodb_inited = 0;
#define INSIDE_HA_INNOBASE_CC
@@ -184,6 +165,7 @@ static char* innobase_data_file_path = NULL;
static char* innobase_log_group_home_dir = NULL;
static char* innobase_file_format_name = NULL;
static char* innobase_change_buffering = NULL;
+static char* innobase_doublewrite_file = NULL;
/* Note: This variable can be set to on/off and any of the supported
file formats in the configuration file, but can only be set to any
@@ -356,7 +338,7 @@ static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
static handler *innobase_create_handler(handlerton *hton,
- TABLE_SHARE *table,
+ TABLE_SHARE *table,
MEM_ROOT *mem_root)
{
return new (mem_root) ha_innobase(hton, table);
@@ -469,8 +451,9 @@ static
int
innobase_start_trx_and_assign_read_view(
/*====================================*/
- handlerton* hton, /*!< in: Innodb handlerton */
- THD* thd); /*!< in: MySQL thread handle of the user for whom
+ /* out: 0 */
+ handlerton* hton, /* in: Innodb handlerton */
+ THD* thd); /* in: MySQL thread handle of the user for whom
the transaction should be committed */
/****************************************************************//**
Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes
@@ -552,6 +535,8 @@ static SHOW_VAR innodb_status_variables[]= {
(char*) &export_vars.innodb_data_written, SHOW_LONG},
{"dblwr_pages_written",
(char*) &export_vars.innodb_dblwr_pages_written, SHOW_LONG},
+ {"deadlocks",
+ (char*) &export_vars.innodb_deadlocks, SHOW_LONG},
{"dblwr_writes",
(char*) &export_vars.innodb_dblwr_writes, SHOW_LONG},
{"dict_tables",
@@ -1893,6 +1878,19 @@ trx_is_interrupted(
return(trx && trx->mysql_thd && thd_killed((THD*) trx->mysql_thd));
}
+/**********************************************************************//**
+Determines if the currently running transaction is in strict mode.
+@return TRUE if strict */
+extern "C" UNIV_INTERN
+ibool
+trx_is_strict(
+/*==========*/
+ trx_t* trx) /*!< in: transaction */
+{
+ return(trx && trx->mysql_thd
+ && THDVAR((THD*) trx->mysql_thd, strict_mode));
+}
+
/**************************************************************//**
Resets some fields of a prebuilt struct. The template is used in fast
retrieval of just those column values MySQL needs in its processing. */
@@ -2229,6 +2227,8 @@ mem_free_and_error:
goto error;
}
+ srv_doublewrite_file = innobase_doublewrite_file;
+
srv_extra_undoslots = (ibool) innobase_extra_undoslots;
/* -------------- Log files ---------------------------*/
@@ -2327,7 +2327,7 @@ mem_free_and_error:
}
sql_print_error("InnoDB: invalid value "
- "innodb_file_format_check=%s",
+ "innodb_change_buffering=%s",
innobase_change_buffering);
goto mem_free_and_error;
}
@@ -2366,7 +2366,6 @@ innobase_change_buffering_inited_ok:
srv_force_recovery = (ulint) innobase_force_recovery;
- srv_fast_recovery = (ibool) innobase_fast_recovery;
srv_recovery_stats = (ibool) innobase_recovery_stats;
srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
@@ -2496,7 +2495,6 @@ skip_overwrite:
pthread_mutex_init(&prepare_commit_mutex, MY_MUTEX_INIT_FAST);
pthread_mutex_init(&commit_threads_m, MY_MUTEX_INIT_FAST);
pthread_mutex_init(&commit_cond_m, MY_MUTEX_INIT_FAST);
- pthread_mutex_init(&analyze_mutex, MY_MUTEX_INIT_FAST);
pthread_cond_init(&commit_cond, NULL);
innodb_inited= 1;
#ifdef MYSQL_DYNAMIC_PLUGIN
@@ -2551,7 +2549,6 @@ innobase_end(
pthread_mutex_destroy(&prepare_commit_mutex);
pthread_mutex_destroy(&commit_threads_m);
pthread_mutex_destroy(&commit_cond_m);
- pthread_mutex_destroy(&analyze_mutex);
pthread_cond_destroy(&commit_cond);
}
@@ -2588,10 +2585,7 @@ innobase_alter_table_flags(
{
return(HA_ONLINE_ADD_INDEX_NO_WRITES
| HA_ONLINE_DROP_INDEX_NO_WRITES
- /* Current InnoDB doesn't sort unique indexes along mysqld's order
- It is dangerous to use index. So it is disabled until
- the bug http://bugs.mysql.com/47622 */
- /* | HA_ONLINE_ADD_UNIQUE_INDEX_NO_WRITES */
+ | HA_ONLINE_ADD_UNIQUE_INDEX_NO_WRITES
| HA_ONLINE_DROP_UNIQUE_INDEX_NO_WRITES
| HA_ONLINE_ADD_PK_INDEX_NO_WRITES);
}
@@ -3286,59 +3280,370 @@ normalize_table_name(
}
/********************************************************************//**
+Get the upper limit of the MySQL integral and floating-point type.
+@return maximum allowed value for the field */
+static
+ulonglong
+innobase_get_int_col_max_value(
+/*===========================*/
+ const Field* field) /*!< in: MySQL field */
+{
+ ulonglong max_value = 0;
+
+ switch(field->key_type()) {
+ /* TINY */
+ case HA_KEYTYPE_BINARY:
+ max_value = 0xFFULL;
+ break;
+ case HA_KEYTYPE_INT8:
+ max_value = 0x7FULL;
+ break;
+ /* SHORT */
+ case HA_KEYTYPE_USHORT_INT:
+ max_value = 0xFFFFULL;
+ break;
+ case HA_KEYTYPE_SHORT_INT:
+ max_value = 0x7FFFULL;
+ break;
+ /* MEDIUM */
+ case HA_KEYTYPE_UINT24:
+ max_value = 0xFFFFFFULL;
+ break;
+ case HA_KEYTYPE_INT24:
+ max_value = 0x7FFFFFULL;
+ break;
+ /* LONG */
+ case HA_KEYTYPE_ULONG_INT:
+ max_value = 0xFFFFFFFFULL;
+ break;
+ case HA_KEYTYPE_LONG_INT:
+ max_value = 0x7FFFFFFFULL;
+ break;
+ /* BIG */
+ case HA_KEYTYPE_ULONGLONG:
+ max_value = 0xFFFFFFFFFFFFFFFFULL;
+ break;
+ case HA_KEYTYPE_LONGLONG:
+ max_value = 0x7FFFFFFFFFFFFFFFULL;
+ break;
+ case HA_KEYTYPE_FLOAT:
+ /* We use the maximum as per IEEE754-2008 standard, 2^24 */
+ max_value = 0x1000000ULL;
+ break;
+ case HA_KEYTYPE_DOUBLE:
+ /* We use the maximum as per IEEE754-2008 standard, 2^53 */
+ max_value = 0x20000000000000ULL;
+ break;
+ default:
+ ut_error;
+ }
+
+ return(max_value);
+}
+
+/*******************************************************************//**
+This function checks whether the index column information
+is consistent between KEY info from mysql and that from innodb index.
+@return TRUE if all column types match. */
+static
+ibool
+innobase_match_index_columns(
+/*=========================*/
+ const KEY* key_info, /*!< in: Index info
+ from mysql */
+ const dict_index_t* index_info) /*!< in: Index info
+ from Innodb */
+{
+ const KEY_PART_INFO* key_part;
+ const KEY_PART_INFO* key_end;
+ const dict_field_t* innodb_idx_fld;
+ const dict_field_t* innodb_idx_fld_end;
+
+ DBUG_ENTER("innobase_match_index_columns");
+
+ /* Check whether user defined index column count matches */
+ if (key_info->key_parts != index_info->n_user_defined_cols) {
+ DBUG_RETURN(FALSE);
+ }
+
+ key_part = key_info->key_part;
+ key_end = key_part + key_info->key_parts;
+ innodb_idx_fld = index_info->fields;
+ innodb_idx_fld_end = index_info->fields + index_info->n_fields;
+
+ /* Check each index column's datatype. We do not check
+ column name because there exists case that index
+ column name got modified in mysql but such change does not
+ propagate to InnoDB.
+ One hidden assumption here is that the index column sequences
+ are matched up between those in mysql and Innodb. */
+ for (; key_part != key_end; ++key_part) {
+ ulint col_type;
+ ibool is_unsigned;
+ ulint mtype = innodb_idx_fld->col->mtype;
+
+ /* Need to translate to InnoDB column type before
+ comparison. */
+ col_type = get_innobase_type_from_mysql_type(&is_unsigned,
+ key_part->field);
+
+ /* Ignore Innodb specific system columns. */
+ while (mtype == DATA_SYS) {
+ innodb_idx_fld++;
+
+ if (innodb_idx_fld >= innodb_idx_fld_end) {
+ DBUG_RETURN(FALSE);
+ }
+ }
+
+ if (col_type != mtype) {
+ /* Column Type mismatches */
+ DBUG_RETURN(FALSE);
+ }
+
+ innodb_idx_fld++;
+ }
+
+ DBUG_RETURN(TRUE);
+}
+
+/*******************************************************************//**
+This function builds a translation table in INNOBASE_SHARE
+structure for fast index location with mysql array number from its
+table->key_info structure. This also provides the necessary translation
+between the key order in mysql key_info and Innodb ib_table->indexes if
+they are not fully matched with each other.
+Note we do not have any mutex protecting the translation table
+building based on the assumption that there is no concurrent
+index creation/drop and DMLs that requires index lookup. All table
+handle will be closed before the index creation/drop.
+@return TRUE if index translation table built successfully */
+static
+ibool
+innobase_build_index_translation(
+/*=============================*/
+ const TABLE* table, /*!< in: table in MySQL data
+ dictionary */
+ dict_table_t* ib_table, /*!< in: table in Innodb data
+ dictionary */
+ INNOBASE_SHARE* share) /*!< in/out: share structure
+ where index translation table
+ will be constructed in. */
+{
+ ulint mysql_num_index;
+ ulint ib_num_index;
+ dict_index_t** index_mapping;
+ ibool ret = TRUE;
+
+ DBUG_ENTER("innobase_build_index_translation");
+
+ mysql_num_index = table->s->keys;
+ ib_num_index = UT_LIST_GET_LEN(ib_table->indexes);
+
+ index_mapping = share->idx_trans_tbl.index_mapping;
+
+ /* If there exists inconsistency between MySQL and InnoDB dictionary
+ (metadata) information, the number of index defined in MySQL
+ could exceed that in InnoDB, do not build index translation
+ table in such case */
+ if (UNIV_UNLIKELY(ib_num_index < mysql_num_index)) {
+ ret = FALSE;
+ goto func_exit;
+ }
+
+ /* If index entry count is non-zero, nothing has
+ changed since last update, directly return TRUE */
+ if (share->idx_trans_tbl.index_count) {
+ /* Index entry count should still match mysql_num_index */
+ ut_a(share->idx_trans_tbl.index_count == mysql_num_index);
+ goto func_exit;
+ }
+
+ /* The number of index increased, rebuild the mapping table */
+ if (mysql_num_index > share->idx_trans_tbl.array_size) {
+ index_mapping = (dict_index_t**) my_realloc(index_mapping,
+ mysql_num_index *
+ sizeof(*index_mapping),
+ MYF(MY_ALLOW_ZERO_PTR));
+
+ if (!index_mapping) {
+ ret = FALSE;
+ goto func_exit;
+ }
+
+ share->idx_trans_tbl.array_size = mysql_num_index;
+ }
+
+
+ /* For each index in the mysql key_info array, fetch its
+ corresponding InnoDB index pointer into index_mapping
+ array. */
+ for (ulint count = 0; count < mysql_num_index; count++) {
+
+ /* Fetch index pointers into index_mapping according to mysql
+ index sequence */
+ index_mapping[count] = dict_table_get_index_on_name(
+ ib_table, table->key_info[count].name);
+
+ if (!index_mapping[count]) {
+ sql_print_error("Cannot find index %s in InnoDB "
+ "index dictionary.",
+ table->key_info[count].name);
+ ret = FALSE;
+ goto func_exit;
+ }
+
+ /* Double check fetched index has the same
+ column info as those in mysql key_info. */
+ if (!innobase_match_index_columns(&table->key_info[count],
+ index_mapping[count])) {
+ sql_print_error("Found index %s whose column info "
+ "does not match that of MySQL.",
+ table->key_info[count].name);
+ ret = FALSE;
+ goto func_exit;
+ }
+ }
+
+ /* Successfully built the translation table */
+ share->idx_trans_tbl.index_count = mysql_num_index;
+
+func_exit:
+ if (!ret) {
+ /* Build translation table failed. */
+ my_free(index_mapping, MYF(MY_ALLOW_ZERO_PTR));
+
+ share->idx_trans_tbl.array_size = 0;
+ share->idx_trans_tbl.index_count = 0;
+ index_mapping = NULL;
+ }
+
+ share->idx_trans_tbl.index_mapping = index_mapping;
+
+ DBUG_RETURN(ret);
+}
+
+/*******************************************************************//**
+This function uses index translation table to quickly locate the
+requested index structure.
+Note we do not have mutex protection for the index translatoin table
+access, it is based on the assumption that there is no concurrent
+translation table rebuild (fter create/drop index) and DMLs that
+require index lookup.
+@return dict_index_t structure for requested index. NULL if
+fail to locate the index structure. */
+static
+dict_index_t*
+innobase_index_lookup(
+/*==================*/
+ INNOBASE_SHARE* share, /*!< in: share structure for index
+ translation table. */
+ uint keynr) /*!< in: index number for the requested
+ index */
+{
+ if (!share->idx_trans_tbl.index_mapping
+ || keynr >= share->idx_trans_tbl.index_count) {
+ return(NULL);
+ }
+
+ return(share->idx_trans_tbl.index_mapping[keynr]);
+}
+
+/************************************************************************
Set the autoinc column max value. This should only be called once from
-ha_innobase::open(). Therefore there's no need for a covering lock.
-@return DB_SUCCESS or error code */
+ha_innobase::open(). Therefore there's no need for a covering lock. */
UNIV_INTERN
-ulint
+void
ha_innobase::innobase_initialize_autoinc()
/*======================================*/
{
- dict_index_t* index;
ulonglong auto_inc;
- const char* col_name;
- ulint error;
+ const Field* field = table->found_next_number_field;
- col_name = table->found_next_number_field->field_name;
- index = innobase_get_index(table->s->next_number_index);
+ if (field != NULL) {
+ auto_inc = innobase_get_int_col_max_value(field);
+ } else {
+ /* We have no idea what's been passed in to us as the
+ autoinc column. We set it to the 0, effectively disabling
+ updates to the table. */
+ auto_inc = 0;
- /* Execute SELECT MAX(col_name) FROM TABLE; */
- error = row_search_max_autoinc(index, col_name, &auto_inc);
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Unable to determine the AUTOINC "
+ "column name\n");
+ }
- switch (error) {
- case DB_SUCCESS:
+ if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
+ /* If the recovery level is set so high that writes
+ are disabled we force the AUTOINC counter to 0
+ value effectively disabling writes to the table.
+ Secondly, we avoid reading the table in case the read
+ results in failure due to a corrupted table/index.
+
+ We will not return an error to the client, so that the
+ tables can be dumped with minimal hassle. If an error
+ were returned in this case, the first attempt to read
+ the table would fail and subsequent SELECTs would succeed. */
+ auto_inc = 0;
+ } else if (field == NULL) {
+ /* This is a far more serious error, best to avoid
+ opening the table and return failure. */
+ my_error(ER_AUTOINC_READ_FAILED, MYF(0));
+ } else {
+ dict_index_t* index;
+ const char* col_name;
+ ulonglong read_auto_inc;
+ ulint err;
- /* At the this stage we don't know the increment
- or the offset, so use default inrement of 1. */
- ++auto_inc;
- break;
+ update_thd(ha_thd());
- case DB_RECORD_NOT_FOUND:
- ut_print_timestamp(stderr);
- fprintf(stderr, " InnoDB: MySQL and InnoDB data "
- "dictionaries are out of sync.\n"
- "InnoDB: Unable to find the AUTOINC column %s in the "
- "InnoDB table %s.\n"
- "InnoDB: We set the next AUTOINC column value to the "
- "maximum possible value,\n"
- "InnoDB: in effect disabling the AUTOINC next value "
- "generation.\n"
- "InnoDB: You can either set the next AUTOINC value "
- "explicitly using ALTER TABLE\n"
- "InnoDB: or fix the data dictionary by recreating "
- "the table.\n",
- col_name, index->table->name);
-
- auto_inc = 0xFFFFFFFFFFFFFFFFULL;
- break;
+ ut_a(prebuilt->trx == thd_to_trx(user_thd));
- default:
- return(error);
+ col_name = field->field_name;
+ index = innobase_get_index(table->s->next_number_index);
+
+ /* Execute SELECT MAX(col_name) FROM TABLE; */
+ err = row_search_max_autoinc(index, col_name, &read_auto_inc);
+
+ switch (err) {
+ case DB_SUCCESS:
+ /* At the this stage we do not know the increment
+ or the offset, so use a default increment of 1. */
+ auto_inc = read_auto_inc + 1;
+ break;
+
+ case DB_RECORD_NOT_FOUND:
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: MySQL and InnoDB data "
+ "dictionaries are out of sync.\n"
+ "InnoDB: Unable to find the AUTOINC column "
+ "%s in the InnoDB table %s.\n"
+ "InnoDB: We set the next AUTOINC column "
+ "value to 0,\n"
+ "InnoDB: in effect disabling the AUTOINC "
+ "next value generation.\n"
+ "InnoDB: You can either set the next "
+ "AUTOINC value explicitly using ALTER TABLE\n"
+ "InnoDB: or fix the data dictionary by "
+ "recreating the table.\n",
+ col_name, index->table->name);
+
+ /* This will disable the AUTOINC generation. */
+ auto_inc = 0;
+
+ /* We want the open to succeed, so that the user can
+ take corrective action. ie. reads should succeed but
+ updates should fail. */
+ err = DB_SUCCESS;
+ break;
+ default:
+ /* row_search_max_autoinc() should only return
+ one of DB_SUCCESS or DB_RECORD_NOT_FOUND. */
+ ut_error;
+ }
}
dict_table_autoinc_initialize(prebuilt->table, auto_inc);
-
- return(DB_SUCCESS);
}
/*****************************************************************//**
@@ -3491,6 +3796,11 @@ retry:
primary_key = table->s->primary_key;
key_used_on_scan = primary_key;
+ if (!innobase_build_index_translation(table, ib_table, share)) {
+ sql_print_error("Build InnoDB index translation table for"
+ " Table %s failed", name);
+ }
+
/* Allocate a buffer for a 'row reference'. A row reference is
a string of bytes of length ref_length which uniquely specifies
a row in our table. Note that MySQL may also compare two row
@@ -3498,31 +3808,86 @@ retry:
of length ref_length! */
if (!row_table_got_default_clust_index(ib_table)) {
- if (primary_key >= MAX_KEY) {
- sql_print_error("Table %s has a primary key in InnoDB data "
- "dictionary, but not in MySQL!", name);
- }
prebuilt->clust_index_was_generated = FALSE;
- /* MySQL allocates the buffer for ref. key_info->key_length
- includes space for all key columns + one byte for each column
- that may be NULL. ref_length must be as exact as possible to
- save space, because all row reference buffers are allocated
- based on ref_length. */
+ if (UNIV_UNLIKELY(primary_key >= MAX_KEY)) {
+ sql_print_error("Table %s has a primary key in "
+ "InnoDB data dictionary, but not "
+ "in MySQL!", name);
+
+ /* This mismatch could cause further problems
+ if not attended, bring this to the user's attention
+ by printing a warning in addition to log a message
+ in the errorlog */
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NO_SUCH_INDEX,
+ "InnoDB: Table %s has a "
+ "primary key in InnoDB data "
+ "dictionary, but not in "
+ "MySQL!", name);
+
+ /* If primary_key >= MAX_KEY, its (primary_key)
+ value could be out of bound if continue to index
+ into key_info[] array. Find InnoDB primary index,
+ and assign its key_length to ref_length.
+ In addition, since MySQL indexes are sorted starting
+ with primary index, unique index etc., initialize
+ ref_length to the first index key length in
+ case we fail to find InnoDB cluster index.
+
+ Please note, this will not resolve the primary
+ index mismatch problem, other side effects are
+ possible if users continue to use the table.
+ However, we allow this table to be opened so
+ that user can adopt necessary measures for the
+ mismatch while still being accessible to the table
+ date. */
+ ref_length = table->key_info[0].key_length;
+
+ /* Find correspoinding cluster index
+ key length in MySQL's key_info[] array */
+ for (ulint i = 0; i < table->s->keys; i++) {
+ dict_index_t* index;
+ index = innobase_get_index(i);
+ if (dict_index_is_clust(index)) {
+ ref_length =
+ table->key_info[i].key_length;
+ }
+ }
+ } else {
+ /* MySQL allocates the buffer for ref.
+ key_info->key_length includes space for all key
+ columns + one byte for each column that may be
+ NULL. ref_length must be as exact as possible to
+ save space, because all row reference buffers are
+ allocated based on ref_length. */
- ref_length = table->key_info[primary_key].key_length;
+ ref_length = table->key_info[primary_key].key_length;
+ }
} else {
if (primary_key != MAX_KEY) {
- sql_print_error("Table %s has no primary key in InnoDB data "
- "dictionary, but has one in MySQL! If you "
- "created the table with a MySQL version < "
- "3.23.54 and did not define a primary key, "
- "but defined a unique key with all non-NULL "
- "columns, then MySQL internally treats that "
- "key as the primary key. You can fix this "
- "error by dump + DROP + CREATE + reimport "
- "of the table.", name);
+ sql_print_error(
+ "Table %s has no primary key in InnoDB data "
+ "dictionary, but has one in MySQL! If you "
+ "created the table with a MySQL version < "
+ "3.23.54 and did not define a primary key, "
+ "but defined a unique key with all non-NULL "
+ "columns, then MySQL internally treats that "
+ "key as the primary key. You can fix this "
+ "error by dump + DROP + CREATE + reimport "
+ "of the table.", name);
+
+ /* This mismatch could cause further problems
+ if not attended, bring this to the user attention
+ by printing a warning in addition to log a message
+ in the errorlog */
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NO_SUCH_INDEX,
+ "InnoDB: Table %s has no "
+ "primary key in InnoDB data "
+ "dictionary, but has one in "
+ "MySQL!", name);
}
prebuilt->clust_index_was_generated = TRUE;
@@ -3564,8 +3929,6 @@ retry:
/* Only if the table has an AUTOINC column. */
if (prebuilt->table != NULL && table->found_next_number_field != NULL) {
- ulint error;
-
dict_table_autoinc_lock(prebuilt->table);
/* Since a table can already be "open" in InnoDB's internal
@@ -3574,8 +3937,7 @@ retry:
autoinc value from a previous MySQL open. */
if (dict_table_autoinc_read(prebuilt->table) == 0) {
- error = innobase_initialize_autoinc();
- ut_a(error == DB_SUCCESS);
+ innobase_initialize_autoinc();
}
dict_table_autoinc_unlock(prebuilt->table);
@@ -3866,6 +4228,11 @@ get_innobase_type_from_mysql_type(
case MYSQL_TYPE_BLOB:
case MYSQL_TYPE_LONG_BLOB:
return(DATA_BLOB);
+ case MYSQL_TYPE_NULL:
+ /* MySQL currently accepts "NULL" datatype, but will
+ reject such datatype in the next release. We will cope
+ with it and not trigger assertion failure in 5.1 */
+ break;
default:
ut_error;
}
@@ -4392,67 +4759,6 @@ skip_field:
}
/********************************************************************//**
-Get the upper limit of the MySQL integral and floating-point type. */
-UNIV_INTERN
-ulonglong
-ha_innobase::innobase_get_int_col_max_value(
-/*========================================*/
- const Field* field)
-{
- ulonglong max_value = 0;
-
- switch(field->key_type()) {
- /* TINY */
- case HA_KEYTYPE_BINARY:
- max_value = 0xFFULL;
- break;
- case HA_KEYTYPE_INT8:
- max_value = 0x7FULL;
- break;
- /* SHORT */
- case HA_KEYTYPE_USHORT_INT:
- max_value = 0xFFFFULL;
- break;
- case HA_KEYTYPE_SHORT_INT:
- max_value = 0x7FFFULL;
- break;
- /* MEDIUM */
- case HA_KEYTYPE_UINT24:
- max_value = 0xFFFFFFULL;
- break;
- case HA_KEYTYPE_INT24:
- max_value = 0x7FFFFFULL;
- break;
- /* LONG */
- case HA_KEYTYPE_ULONG_INT:
- max_value = 0xFFFFFFFFULL;
- break;
- case HA_KEYTYPE_LONG_INT:
- max_value = 0x7FFFFFFFULL;
- break;
- /* BIG */
- case HA_KEYTYPE_ULONGLONG:
- max_value = 0xFFFFFFFFFFFFFFFFULL;
- break;
- case HA_KEYTYPE_LONGLONG:
- max_value = 0x7FFFFFFFFFFFFFFFULL;
- break;
- case HA_KEYTYPE_FLOAT:
- /* We use the maximum as per IEEE754-2008 standard, 2^24 */
- max_value = 0x1000000ULL;
- break;
- case HA_KEYTYPE_DOUBLE:
- /* We use the maximum as per IEEE754-2008 standard, 2^53 */
- max_value = 0x20000000000000ULL;
- break;
- default:
- ut_error;
- }
-
- return(max_value);
-}
-
-/********************************************************************//**
This special handling is really to overcome the limitations of MySQL's
binlogging. We need to eliminate the non-determinism that will arise in
INSERT ... SELECT type of statements, since MySQL binlog only stores the
@@ -4681,11 +4987,17 @@ no_commit:
prebuilt->autoinc_error = DB_SUCCESS;
if ((error = update_auto_increment())) {
-
/* We don't want to mask autoinc overflow errors. */
- if (prebuilt->autoinc_error != DB_SUCCESS) {
- error = (int) prebuilt->autoinc_error;
+ /* Handle the case where the AUTOINC sub-system
+ failed during initialization. */
+ if (prebuilt->autoinc_error == DB_UNSUPPORTED) {
+ error_result = ER_AUTOINC_READ_FAILED;
+ /* Set the error message to report too. */
+ my_error(ER_AUTOINC_READ_FAILED, MYF(0));
+ goto func_exit;
+ } else if (prebuilt->autoinc_error != DB_SUCCESS) {
+ error = (int) prebuilt->autoinc_error;
goto report_error;
}
@@ -5166,7 +5478,7 @@ ha_innobase::unlock_row(void)
case ROW_READ_WITH_LOCKS:
if (!srv_locks_unsafe_for_binlog
&& prebuilt->trx->isolation_level
- != TRX_ISO_READ_COMMITTED) {
+ > TRX_ISO_READ_COMMITTED) {
break;
}
/* fall through */
@@ -5205,7 +5517,7 @@ ha_innobase::try_semi_consistent_read(bool yes)
if (yes
&& (srv_locks_unsafe_for_binlog
- || prebuilt->trx->isolation_level == TRX_ISO_READ_COMMITTED)) {
+ || prebuilt->trx->isolation_level <= TRX_ISO_READ_COMMITTED)) {
prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
} else {
prebuilt->row_read_type = ROW_READ_WITH_LOCKS;
@@ -5508,14 +5820,30 @@ ha_innobase::innobase_get_index(
DBUG_ENTER("innobase_get_index");
ha_statistic_increment(&SSV::ha_read_key_count);
- ut_ad(user_thd == ha_thd());
- ut_a(prebuilt->trx == thd_to_trx(user_thd));
-
if (keynr != MAX_KEY && table->s->keys > 0) {
key = table->key_info + keynr;
- index = dict_table_get_index_on_name(prebuilt->table,
- key->name);
+ index = innobase_index_lookup(share, keynr);
+
+ if (index) {
+ ut_a(ut_strcmp(index->name, key->name) == 0);
+ } else {
+ /* Can't find index with keynr in the translation
+ table. Only print message if the index translation
+ table exists */
+ if (share->idx_trans_tbl.index_mapping) {
+ sql_print_error("InnoDB could not find "
+ "index %s key no %u for "
+ "table %s through its "
+ "index translation table",
+ key ? key->name : "NULL",
+ keynr,
+ prebuilt->table->name);
+ }
+
+ index = dict_table_get_index_on_name(prebuilt->table,
+ key->name);
+ }
} else {
index = dict_table_get_first_index(prebuilt->table);
}
@@ -5581,7 +5909,7 @@ ha_innobase::change_active_index(
dtuple_set_n_fields(prebuilt->search_tuple, prebuilt->index->n_fields);
dict_index_copy_types(prebuilt->search_tuple, prebuilt->index,
- prebuilt->index->n_fields);
+ prebuilt->index->n_fields);
/* MySQL changes the active index for a handle also during some
queries, for example SELECT MAX(a), SUM(a) first retrieves the MAX()
@@ -6014,7 +6342,22 @@ create_table_def(
field = form->field[i];
col_type = get_innobase_type_from_mysql_type(&unsigned_type,
- field);
+ field);
+
+ if (!col_type) {
+ push_warning_printf(
+ (THD*) trx->mysql_thd,
+ MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_CANT_CREATE_TABLE,
+ "Error creating table '%s' with "
+ "column '%s'. Please check its "
+ "column type and try to re-create "
+ "the table with an appropriate "
+ "column type.",
+ table->name, (char*) field->field_name);
+ goto err_col;
+ }
+
if (field->null_ptr) {
nulls_allowed = 0;
} else {
@@ -6072,7 +6415,7 @@ create_table_def(
if (dict_col_name_is_reserved(field->field_name)){
my_error(ER_WRONG_COLUMN_NAME, MYF(0),
field->field_name);
-
+err_col:
dict_mem_table_free(table);
trx_commit_for_mysql(trx);
@@ -6095,9 +6438,11 @@ create_table_def(
if (error == DB_DUPLICATE_KEY) {
char buf[100];
- innobase_convert_identifier(buf, sizeof buf,
- table_name, strlen(table_name),
- trx->mysql_thd, TRUE);
+ char* buf_end = innobase_convert_identifier(
+ buf, sizeof buf - 1, table_name, strlen(table_name),
+ trx->mysql_thd, TRUE);
+
+ *buf_end = '\0';
my_error(ER_TABLE_EXISTS_ERROR, MYF(0), buf);
}
@@ -6699,6 +7044,10 @@ ha_innobase::create(
goto cleanup;
}
+ if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
+ flags |= DICT_TF2_TEMPORARY << DICT_TF2_SHIFT;
+ }
+
error = create_table_def(trx, form, norm_name,
create_info->options & HA_LEX_CREATE_TMP_TABLE ? name2 : NULL,
flags);
@@ -7230,10 +7579,15 @@ ha_innobase::records_in_range(
key = table->key_info + active_index;
- index = dict_table_get_index_on_name(prebuilt->table, key->name);
+ index = innobase_get_index(keynr);
- /* MySQL knows about this index and so we must be able to find it.*/
- ut_a(index);
+ /* There exists possibility of not being able to find requested
+ index due to inconsistency between MySQL and InoDB dictionary info.
+ Necessary message should have been printed in innobase_get_index() */
+ if (UNIV_UNLIKELY(!index)) {
+ n_rows = HA_POS_ERROR;
+ goto func_exit;
+ }
heap = mem_heap_create(2 * (key->key_parts * sizeof(dfield_t)
+ sizeof(dtuple_t)));
@@ -7278,6 +7632,7 @@ ha_innobase::records_in_range(
mem_heap_free(heap);
+func_exit:
my_free(key_val_buff2, MYF(0));
prebuilt->trx->op_info = (char*)"";
@@ -7429,6 +7784,7 @@ ha_innobase::info(
char path[FN_REFLEN];
os_file_stat_t stat_info;
+
DBUG_ENTER("info");
/* If we are forcing recovery at a high level, we will suppress
@@ -7591,13 +7947,29 @@ ha_innobase::info(
}
if (flag & HA_STATUS_CONST) {
- index = dict_table_get_first_index(ib_table);
+ /* Verify the number of index in InnoDB and MySQL
+ matches up. If prebuilt->clust_index_was_generated
+ holds, InnoDB defines GEN_CLUST_INDEX internally */
+ ulint num_innodb_index = UT_LIST_GET_LEN(ib_table->indexes)
+ - prebuilt->clust_index_was_generated;
- if (prebuilt->clust_index_was_generated) {
- index = dict_table_get_next_index(index);
+ if (table->s->keys != num_innodb_index) {
+ sql_print_error("Table %s contains %lu "
+ "indexes inside InnoDB, which "
+ "is different from the number of "
+ "indexes %u defined in the MySQL ",
+ ib_table->name, num_innodb_index,
+ table->s->keys);
}
for (i = 0; i < table->s->keys; i++) {
+ /* We could get index quickly through internal
+ index mapping with the index translation table.
+ The identity of index (match up index name with
+ that of table->key_info[i]) is already verified in
+ innobase_get_index(). */
+ index = innobase_get_index(i);
+
if (index == NULL) {
sql_print_error("Table %s contains fewer "
"indexes inside InnoDB than "
@@ -7626,6 +7998,8 @@ ha_innobase::info(
break;
}
+ dict_index_stat_mutex_enter(index);
+
if (index->stat_n_diff_key_vals[j + 1] == 0) {
rec_per_key = stats.records;
@@ -7634,6 +8008,8 @@ ha_innobase::info(
index->stat_n_diff_key_vals[j + 1]);
}
+ dict_index_stat_mutex_exit(index);
+
/* Since MySQL seems to favor table scans
too much over index searches, we pretend
index selectivity is 2 times better than
@@ -7649,8 +8025,6 @@ ha_innobase::info(
rec_per_key >= ~(ulong) 0 ? ~(ulong) 0 :
(ulong) rec_per_key;
}
-
- index = dict_table_get_next_index(index);
}
}
@@ -7694,15 +8068,9 @@ ha_innobase::analyze(
return(HA_ADMIN_CORRUPT);
}
- /* Serialize ANALYZE TABLE inside InnoDB, see
- Bug#38996 Race condition in ANALYZE TABLE */
- pthread_mutex_lock(&analyze_mutex);
-
/* Simply call ::info() with all the flags */
info(HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE);
- pthread_mutex_unlock(&analyze_mutex);
-
if (share->ib_table->is_corrupt) {
return(HA_ADMIN_CORRUPT);
}
@@ -7736,8 +8104,13 @@ ha_innobase::check(
HA_CHECK_OPT* check_opt) /*!< in: check options, currently
ignored */
{
- ulint ret;
+ dict_index_t* index;
+ ulint n_rows;
+ ulint n_rows_in_table = ULINT_UNDEFINED;
+ ibool is_ok = TRUE;
+ ulint old_isolation_level;
+ DBUG_ENTER("ha_innobase::check");
DBUG_ASSERT(thd == ha_thd());
ut_a(prebuilt->trx);
ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N);
@@ -7750,21 +8123,144 @@ ha_innobase::check(
build_template(prebuilt, NULL, table, ROW_MYSQL_WHOLE_ROW);
}
- ret = row_check_table_for_mysql(prebuilt);
+ if (prebuilt->table->ibd_file_missing) {
+ sql_print_error("InnoDB: Error:\n"
+ "InnoDB: MySQL is trying to use a table handle"
+ " but the .ibd file for\n"
+ "InnoDB: table %s does not exist.\n"
+ "InnoDB: Have you deleted the .ibd file"
+ " from the database directory under\n"
+ "InnoDB: the MySQL datadir, or have you"
+ " used DISCARD TABLESPACE?\n"
+ "InnoDB: Please refer to\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+ "InnoDB: how you can resolve the problem.\n",
+ prebuilt->table->name);
+ DBUG_RETURN(HA_ADMIN_CORRUPT);
+ }
- if (ret != DB_INTERRUPTED && share->ib_table->is_corrupt) {
- return(HA_ADMIN_CORRUPT);
+ prebuilt->trx->op_info = "checking table";
+
+ old_isolation_level = prebuilt->trx->isolation_level;
+
+ /* We must run the index record counts at an isolation level
+ >= READ COMMITTED, because a dirty read can see a wrong number
+ of records in some index; to play safe, we use always
+ REPEATABLE READ here */
+
+ prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+ /* Enlarge the fatal lock wait timeout during CHECK TABLE. */
+ mutex_enter(&kernel_mutex);
+ srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */
+ mutex_exit(&kernel_mutex);
+
+ for (index = dict_table_get_first_index(prebuilt->table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+#if 0
+ fputs("Validating index ", stderr);
+ ut_print_name(stderr, trx, FALSE, index->name);
+ putc('\n', stderr);
+#endif
+
+ if (!btr_validate_index(index, prebuilt->trx)) {
+ is_ok = FALSE;
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: The B-tree of"
+ " index '%-.200s' is corrupted.",
+ index->name);
+ continue;
+ }
+
+ /* Instead of invoking change_active_index(), set up
+ a dummy template for non-locking reads, disabling
+ access to the clustered index. */
+ prebuilt->index = index;
+
+ prebuilt->index_usable = row_merge_is_index_usable(
+ prebuilt->trx, prebuilt->index);
+
+ if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ HA_ERR_TABLE_DEF_CHANGED,
+ "InnoDB: Insufficient history for"
+ " index '%-.200s'",
+ index->name);
+ continue;
+ }
+
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE;
+ prebuilt->n_template = 0;
+ prebuilt->need_to_access_clustered = FALSE;
+
+ dtuple_set_n_fields(prebuilt->search_tuple, 0);
+
+ prebuilt->select_lock_type = LOCK_NONE;
+
+ if (!row_check_index_for_mysql(prebuilt, index, &n_rows)) {
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: The B-tree of"
+ " index '%-.200s' is corrupted.",
+ index->name);
+ is_ok = FALSE;
+ }
+
+ if (thd_killed(user_thd)) {
+ break;
+ }
+
+#if 0
+ fprintf(stderr, "%lu entries in index %s\n", n_rows,
+ index->name);
+#endif
+
+ if (index == dict_table_get_first_index(prebuilt->table)) {
+ n_rows_in_table = n_rows;
+ } else if (n_rows != n_rows_in_table) {
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: Index '%-.200s'"
+ " contains %lu entries,"
+ " should be %lu.",
+ index->name,
+ (ulong) n_rows,
+ (ulong) n_rows_in_table);
+ is_ok = FALSE;
+ }
}
- switch (ret) {
- case DB_SUCCESS:
- return(HA_ADMIN_OK);
- case DB_INTERRUPTED:
+ /* Restore the original isolation level */
+ prebuilt->trx->isolation_level = old_isolation_level;
+
+ /* We validate also the whole adaptive hash index for all tables
+ at every CHECK TABLE */
+
+ if (!btr_search_validate()) {
+ push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: The adaptive hash index is corrupted.");
+ is_ok = FALSE;
+ }
+
+ /* Restore the fatal lock wait timeout after CHECK TABLE. */
+ mutex_enter(&kernel_mutex);
+ srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */
+ mutex_exit(&kernel_mutex);
+
+ prebuilt->trx->op_info = "";
+ if (thd_killed(user_thd)) {
my_error(ER_QUERY_INTERRUPTED, MYF(0));
- return(-1);
- default:
+ }
+
+ if (share->ib_table->is_corrupt) {
return(HA_ADMIN_CORRUPT);
}
+
+ DBUG_RETURN(is_ok ? HA_ADMIN_OK : HA_ADMIN_CORRUPT);
}
/*************************************************************//**
@@ -8428,7 +8924,9 @@ ha_innobase::external_lock(
if (trx->n_mysql_tables_in_use == 0) {
#ifdef EXTENDED_SLOWLOG
- increment_thd_innodb_stats(thd, trx->io_reads,
+ increment_thd_innodb_stats(thd,
+ (unsigned long long) ut_conv_dulint_to_longlong(trx->id),
+ trx->io_reads,
trx->io_read,
trx->io_reads_wait_timer,
trx->lock_que_wait_timer,
@@ -8608,8 +9106,8 @@ innodb_show_status(
mutex_enter(&srv_monitor_file_mutex);
rewind(srv_monitor_file);
- srv_printf_innodb_monitor(srv_monitor_file,
- &trx_list_start, &trx_list_end);
+ srv_printf_innodb_monitor(srv_monitor_file, FALSE,
+ &trx_list_start, &trx_list_end);
flen = ftell(srv_monitor_file);
os_file_set_eof(srv_monitor_file);
@@ -8666,19 +9164,25 @@ innodb_show_status(
}
/************************************************************************//**
-Implements the SHOW MUTEX STATUS command. . */
+Implements the SHOW MUTEX STATUS command.
+@return TRUE on failure, FALSE on success. */
static
bool
innodb_mutex_show_status(
/*=====================*/
- handlerton* hton, /*!< in: the innodb handlerton */
+ handlerton* hton, /*!< in: the innodb handlerton */
THD* thd, /*!< in: the MySQL query thread of the
caller */
- stat_print_fn* stat_print)
+ stat_print_fn* stat_print) /*!< in: function for printing
+ statistics */
{
char buf1[IO_SIZE], buf2[IO_SIZE];
mutex_t* mutex;
rw_lock_t* lock;
+ ulint block_mutex_oswait_count = 0;
+ ulint block_lock_oswait_count = 0;
+ mutex_t* block_mutex = NULL;
+ rw_lock_t* block_lock = NULL;
#ifdef UNIV_DEBUG
ulint rw_lock_count= 0;
ulint rw_lock_count_spin_loop= 0;
@@ -8693,12 +9197,16 @@ innodb_mutex_show_status(
mutex_enter(&mutex_list_mutex);
- mutex = UT_LIST_GET_FIRST(mutex_list);
+ for (mutex = UT_LIST_GET_FIRST(mutex_list); mutex != NULL;
+ mutex = UT_LIST_GET_NEXT(list, mutex)) {
+ if (mutex->count_os_wait == 0) {
+ continue;
+ }
- while (mutex != NULL) {
- if (mutex->count_os_wait == 0
- || buf_pool_is_block_mutex(mutex)) {
- goto next_mutex;
+ if (buf_pool_is_block_mutex(mutex)) {
+ block_mutex = mutex;
+ block_mutex_oswait_count += mutex->count_os_wait;
+ continue;
}
#ifdef UNIV_DEBUG
if (mutex->mutex_type != 1) {
@@ -8725,8 +9233,7 @@ innodb_mutex_show_status(
DBUG_RETURN(1);
}
}
- }
- else {
+ } else {
rw_lock_count += mutex->count_using;
rw_lock_count_spin_loop += mutex->count_spin_loop;
rw_lock_count_spin_rounds += mutex->count_spin_rounds;
@@ -8738,7 +9245,7 @@ innodb_mutex_show_status(
buf1len= (uint) my_snprintf(buf1, sizeof(buf1), "%s",
mutex->cmutex_name);
buf2len= (uint) my_snprintf(buf2, sizeof(buf2), "os_waits=%lu",
- mutex->count_os_wait);
+ (ulong) mutex->count_os_wait);
if (stat_print(thd, innobase_hton_name,
hton_name_len, buf1, buf1len,
@@ -8747,45 +9254,81 @@ innodb_mutex_show_status(
DBUG_RETURN(1);
}
#endif /* UNIV_DEBUG */
+ }
+
+ if (block_mutex) {
+ buf1len = (uint) my_snprintf(buf1, sizeof buf1,
+ "combined %s",
+ block_mutex->cmutex_name);
+ buf2len = (uint) my_snprintf(buf2, sizeof buf2,
+ "os_waits=%lu",
+ (ulong) block_mutex_oswait_count);
-next_mutex:
- mutex = UT_LIST_GET_NEXT(list, mutex);
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len, buf1, buf1len,
+ buf2, buf2len)) {
+ mutex_exit(&mutex_list_mutex);
+ DBUG_RETURN(1);
+ }
}
mutex_exit(&mutex_list_mutex);
mutex_enter(&rw_lock_list_mutex);
- lock = UT_LIST_GET_FIRST(rw_lock_list);
-
- while (lock != NULL) {
- if (lock->count_os_wait
- && !buf_pool_is_block_lock(lock)) {
- buf1len= my_snprintf(buf1, sizeof(buf1), "%s",
- lock->lock_name);
- buf2len= my_snprintf(buf2, sizeof(buf2),
- "os_waits=%lu", lock->count_os_wait);
-
- if (stat_print(thd, innobase_hton_name,
- hton_name_len, buf1, buf1len,
- buf2, buf2len)) {
- mutex_exit(&rw_lock_list_mutex);
- DBUG_RETURN(1);
- }
+ for (lock = UT_LIST_GET_FIRST(rw_lock_list); lock != NULL;
+ lock = UT_LIST_GET_NEXT(list, lock)) {
+ if (lock->count_os_wait == 0) {
+ continue;
+ }
+
+ if (buf_pool_is_block_lock(lock)) {
+ block_lock = lock;
+ block_lock_oswait_count += lock->count_os_wait;
+ continue;
+ }
+
+ buf1len = my_snprintf(buf1, sizeof buf1, "%s",
+ lock->lock_name);
+ buf2len = my_snprintf(buf2, sizeof buf2, "os_waits=%lu",
+ (ulong) lock->count_os_wait);
+
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len, buf1, buf1len,
+ buf2, buf2len)) {
+ mutex_exit(&rw_lock_list_mutex);
+ DBUG_RETURN(1);
+ }
+ }
+
+ if (block_lock) {
+ buf1len = (uint) my_snprintf(buf1, sizeof buf1,
+ "combined %s",
+ block_lock->lock_name);
+ buf2len = (uint) my_snprintf(buf2, sizeof buf2,
+ "os_waits=%lu",
+ (ulong) block_lock_oswait_count);
+
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len, buf1, buf1len,
+ buf2, buf2len)) {
+ mutex_exit(&rw_lock_list_mutex);
+ DBUG_RETURN(1);
}
- lock = UT_LIST_GET_NEXT(list, lock);
}
mutex_exit(&rw_lock_list_mutex);
#ifdef UNIV_DEBUG
- buf2len= my_snprintf(buf2, sizeof(buf2),
- "count=%lu, spin_waits=%lu, spin_rounds=%lu, "
- "os_waits=%lu, os_yields=%lu, os_wait_times=%lu",
- rw_lock_count, rw_lock_count_spin_loop,
- rw_lock_count_spin_rounds,
- rw_lock_count_os_wait, rw_lock_count_os_yield,
- (ulong) (rw_lock_wait_time/1000));
+ buf2len = my_snprintf(buf2, sizeof buf2,
+ "count=%lu, spin_waits=%lu, spin_rounds=%lu, "
+ "os_waits=%lu, os_yields=%lu, os_wait_times=%lu",
+ (ulong) rw_lock_count,
+ (ulong) rw_lock_count_spin_loop,
+ (ulong) rw_lock_count_spin_rounds,
+ (ulong) rw_lock_count_os_wait,
+ (ulong) rw_lock_count_os_yield,
+ (ulong) (rw_lock_wait_time / 1000));
if (stat_print(thd, innobase_hton_name, hton_name_len,
STRING_WITH_LEN("rw_lock_mutexes"), buf2, buf2len)) {
@@ -8847,6 +9390,11 @@ static INNOBASE_SHARE* get_share(const char* table_name)
innobase_open_tables, fold, share);
thr_lock_init(&share->lock);
+
+ /* Index translation table initialization */
+ share->idx_trans_tbl.index_mapping = NULL;
+ share->idx_trans_tbl.index_count = 0;
+ share->idx_trans_tbl.array_size = 0;
}
share->use_count++;
@@ -8877,6 +9425,11 @@ static void free_share(INNOBASE_SHARE* share)
HASH_DELETE(INNOBASE_SHARE, table_name_hash,
innobase_open_tables, fold, share);
thr_lock_delete(&share->lock);
+
+ /* Free any memory from index translation table */
+ my_free(share->idx_trans_tbl.index_mapping,
+ MYF(MY_ALLOW_ZERO_PTR));
+
my_free(share, MYF(0));
/* TODO: invoke HASH_MIGRATE if innobase_open_tables
@@ -8979,7 +9532,7 @@ ha_innobase::store_lock(
isolation_level = trx->isolation_level;
if ((srv_locks_unsafe_for_binlog
- || isolation_level == TRX_ISO_READ_COMMITTED)
+ || isolation_level <= TRX_ISO_READ_COMMITTED)
&& isolation_level != TRX_ISO_SERIALIZABLE
&& (lock_type == TL_READ || lock_type == TL_READ_NO_INSERT)
&& (sql_command == SQLCOM_INSERT_SELECT
@@ -9111,7 +9664,10 @@ ha_innobase::innobase_get_autoinc(
*value = dict_table_autoinc_read(prebuilt->table);
/* It should have been initialized during open. */
- ut_a(*value != 0);
+ if (*value == 0) {
+ prebuilt->autoinc_error = DB_UNSUPPORTED;
+ dict_table_autoinc_unlock(prebuilt->table);
+ }
}
return(prebuilt->autoinc_error);
@@ -9727,33 +10283,60 @@ innobase_set_cursor_view(
(cursor_view_t*) curview);
}
+/*******************************************************************//**
+If col_name is not NULL, check whether the named column is being
+renamed in the table. If col_name is not provided, check
+whether any one of columns in the table is being renamed.
+@return true if the column is being renamed */
+static
+bool
+check_column_being_renamed(
+/*=======================*/
+ const TABLE* table, /*!< in: MySQL table */
+ const char* col_name) /*!< in: name of the column */
+{
+ uint k;
+ Field* field;
+
+ for (k = 0; k < table->s->fields; k++) {
+ field = table->field[k];
-/***********************************************************************
-Check whether any of the given columns is being renamed in the table. */
+ if (field->flags & FIELD_IS_RENAMED) {
+
+ /* If col_name is not provided, return
+ if the field is marked as being renamed. */
+ if (!col_name) {
+ return(true);
+ }
+
+ /* If col_name is provided, return only
+ if names match */
+ if (innobase_strcasecmp(field->field_name,
+ col_name) == 0) {
+ return(true);
+ }
+ }
+ }
+
+ return(false);
+}
+
+/*******************************************************************//**
+Check whether any of the given columns is being renamed in the table.
+@return true if any of col_names is being renamed in table */
static
bool
column_is_being_renamed(
/*====================*/
- /* out: true if any of col_names is
- being renamed in table */
- TABLE* table, /* in: MySQL table */
- uint n_cols, /* in: number of columns */
- const char** col_names) /* in: names of the columns */
+ TABLE* table, /*!< in: MySQL table */
+ uint n_cols, /*!< in: number of columns */
+ const char** col_names) /*!< in: names of the columns */
{
uint j;
- uint k;
- Field* field;
- const char* col_name;
for (j = 0; j < n_cols; j++) {
- col_name = col_names[j];
- for (k = 0; k < table->s->fields; k++) {
- field = table->field[k];
- if ((field->flags & FIELD_IS_RENAMED)
- && innobase_strcasecmp(field->field_name,
- col_name) == 0) {
- return(true);
- }
+ if (check_column_being_renamed(table, col_names[j])) {
+ return(true);
}
}
@@ -9844,18 +10427,13 @@ ha_innobase::check_if_incompatible_data(
DBUG_RETURN(COMPATIBLE_DATA_NO);
}
- /* Renaming column asynchronizes dictionary between mysqld and InnoDB...
- If not synchronized, treat as COMPATIBLE_DATA_NO
- until the bug http://bugs.mysql.com/47621 is fixed officialily */
- {
- uint i;
- for (i = 0; i < table->s->fields; i++) {
- if (table->field[i]->flags & FIELD_IN_ADD_INDEX
- && innobase_strcasecmp(table->field[i]->field_name,
- dict_table_get_col_name(prebuilt->table, i))) {
- DBUG_RETURN(COMPATIBLE_DATA_NO);
- }
- }
+ /* For column rename operation, MySQL does not supply enough
+ information (new column name etc.) for InnoDB to make appropriate
+ system metadata change. To avoid system metadata inconsistency,
+ currently we can just request a table rebuild/copy by returning
+ COMPATIBLE_DATA_NO */
+ if (check_column_being_renamed(table, NULL)) {
+ DBUG_RETURN(COMPATIBLE_DATA_NO);
}
/* Check if a column participating in a foreign key is being renamed.
@@ -10385,8 +10963,8 @@ static MYSQL_SYSVAR_BOOL(extra_undoslots, innobase_extra_undoslots,
static MYSQL_SYSVAR_BOOL(fast_recovery, innobase_fast_recovery,
PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
- "Enable to use speed hack of recovery avoiding flush list sorting.",
- NULL, NULL, TRUE);
+ "obsolete option. affects nothing.",
+ NULL, NULL, FALSE);
static MYSQL_SYSVAR_BOOL(recovery_stats, innobase_recovery_stats,
PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
@@ -10679,6 +11257,11 @@ static MYSQL_SYSVAR_STR(data_file_path, innobase_data_file_path,
"Path to individual files and their sizes.",
NULL, NULL, NULL);
+static MYSQL_SYSVAR_STR(doublewrite_file, innobase_doublewrite_file,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Path to special datafile for doublewrite buffer. (default is "": not used) ### ONLY FOR EXPERTS!!! ###",
+ NULL, NULL, NULL);
+
static MYSQL_SYSVAR_LONG(autoinc_lock_mode, innobase_autoinc_lock_mode,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"The AUTOINC lock modes supported by InnoDB: "
@@ -10703,13 +11286,13 @@ static MYSQL_SYSVAR_BOOL(use_sys_malloc, srv_use_sys_malloc,
static MYSQL_SYSVAR_STR(change_buffering, innobase_change_buffering,
PLUGIN_VAR_RQCMDARG,
"Buffer changes to reduce random access: "
- "OFF, ON, inserting, deleting, changing, or purging.",
+ "OFF, ON, none, inserts.",
innodb_change_buffering_validate,
innodb_change_buffering_update, NULL);
static MYSQL_SYSVAR_ULONG(read_ahead_threshold, srv_read_ahead_threshold,
PLUGIN_VAR_RQCMDARG,
- "Number of pages that must be accessed sequentially for InnoDB to"
+ "Number of pages that must be accessed sequentially for InnoDB to "
"trigger a readahead.",
NULL, NULL, 56, 0, 64, 0);
@@ -10822,11 +11405,6 @@ static MYSQL_SYSVAR_ULONG(dict_size_limit, srv_dict_size_limit,
"Limit the allocated memory for dictionary cache. (0: unlimited)",
NULL, NULL, 0, 0, LONG_MAX, 0);
-static MYSQL_SYSVAR_ULINT(relax_table_creation, srv_relax_table_creation,
- PLUGIN_VAR_RQCMDARG,
- "Relax limitation of column size at table creation as builtin InnoDB.",
- NULL, NULL, 0, 0, 1, 0);
-
static MYSQL_SYSVAR_ULINT(pass_corrupt_table, srv_pass_corrupt_table,
PLUGIN_VAR_RQCMDARG,
"Pass corruptions of user tables as 'corrupt table' instead of not crashing itself, "
@@ -10845,6 +11423,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(commit_concurrency),
MYSQL_SYSVAR(concurrency_tickets),
MYSQL_SYSVAR(data_file_path),
+ MYSQL_SYSVAR(doublewrite_file),
MYSQL_SYSVAR(data_home_dir),
MYSQL_SYSVAR(doublewrite),
MYSQL_SYSVAR(extra_undoslots),
@@ -10915,7 +11494,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(read_ahead_threshold),
MYSQL_SYSVAR(io_capacity),
MYSQL_SYSVAR(use_purge_thread),
- MYSQL_SYSVAR(relax_table_creation),
MYSQL_SYSVAR(pass_corrupt_table),
NULL
};
diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h
index 52fc54fde8c..729abcbcd0a 100644
--- a/storage/xtradb/handler/ha_innodb.h
+++ b/storage/xtradb/handler/ha_innodb.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2000, 2009, MySQL AB & Innobase Oy. All Rights Reserved.
+Copyright (c) 2000, 2010, MySQL AB & Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -27,16 +27,32 @@ Place, Suite 330, Boston, MA 02111-1307 USA
#pragma interface /* gcc class implementation */
#endif
+/* Structure defines translation table between mysql index and innodb
+index structures */
+typedef struct innodb_idx_translate_struct {
+ ulint index_count; /*!< number of valid index entries
+ in the index_mapping array */
+ ulint array_size; /*!< array size of index_mapping */
+ dict_index_t** index_mapping; /*!< index pointer array directly
+ maps to index in Innodb from MySQL
+ array index */
+} innodb_idx_translate_t;
+
+
/** InnoDB table share */
typedef struct st_innobase_share {
- THR_LOCK lock; /*!< MySQL lock protecting
- this structure */
- const char* table_name; /*!< InnoDB table name */
- uint use_count; /*!< reference count,
- incremented in get_share()
- and decremented in free_share() */
- void* table_name_hash;/*!< hash table chain node */
- dict_table_t* ib_table;
+ THR_LOCK lock; /*!< MySQL lock protecting
+ this structure */
+ const char* table_name; /*!< InnoDB table name */
+ uint use_count; /*!< reference count,
+ incremented in get_share()
+ and decremented in
+ free_share() */
+ void* table_name_hash;/*!< hash table chain node */
+ innodb_idx_translate_t idx_trans_tbl; /*!< index translation
+ table between MySQL and
+ Innodb */
+ dict_table_t* ib_table;
} INNOBASE_SHARE;
@@ -92,9 +108,8 @@ class ha_innobase: public handler
ulint innobase_reset_autoinc(ulonglong auto_inc);
ulint innobase_get_autoinc(ulonglong* value);
ulint innobase_update_autoinc(ulonglong auto_inc);
- ulint innobase_initialize_autoinc();
+ void innobase_initialize_autoinc();
dict_index_t* innobase_get_index(uint keynr);
- ulonglong innobase_get_int_col_max_value(const Field* field);
/* Init values for the class: */
public:
diff --git a/storage/xtradb/handler/handler0alter.cc b/storage/xtradb/handler/handler0alter.cc
index e060d88b3b8..2eb9c9f474c 100644
--- a/storage/xtradb/handler/handler0alter.cc
+++ b/storage/xtradb/handler/handler0alter.cc
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -229,9 +229,11 @@ static
int
innobase_check_index_keys(
/*======================*/
- const KEY* key_info, /*!< in: Indexes to be created */
- ulint num_of_keys) /*!< in: Number of indexes to
- be created */
+ const KEY* key_info, /*!< in: Indexes to be
+ created */
+ ulint num_of_keys, /*!< in: Number of
+ indexes to be created */
+ const dict_table_t* table) /*!< in: Existing indexes */
{
ulint key_num;
@@ -248,9 +250,22 @@ innobase_check_index_keys(
const KEY& key2 = key_info[i];
if (0 == strcmp(key.name, key2.name)) {
- sql_print_error("InnoDB: key name `%s` appears"
- " twice in CREATE INDEX\n",
- key.name);
+ my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+ key.name);
+
+ return(ER_WRONG_NAME_FOR_INDEX);
+ }
+ }
+
+ /* Check that the same index name does not already exist. */
+
+ for (const dict_index_t* index
+ = dict_table_get_first_index(table);
+ index; index = dict_table_get_next_index(index)) {
+
+ if (0 == strcmp(key.name, index->name)) {
+ my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+ key.name);
return(ER_WRONG_NAME_FOR_INDEX);
}
@@ -258,7 +273,7 @@ innobase_check_index_keys(
/* Check that MySQL does not try to create a column
prefix index field on an inappropriate data type and
- that the same colum does not appear twice in the index. */
+ that the same column does not appear twice in the index. */
for (ulint i = 0; i < key.key_parts; i++) {
const KEY_PART_INFO& key_part1
@@ -289,14 +304,8 @@ innobase_check_index_keys(
}
}
- sql_print_error("InnoDB: MySQL is trying to"
- " create a column prefix"
- " index field on an"
- " inappropriate data type."
- " column `%s`,"
- " index `%s`.\n",
- field->field_name,
- key.name);
+ my_error(ER_WRONG_KEY_COLUMN, MYF(0),
+ field->field_name);
return(ER_WRONG_KEY_COLUMN);
}
@@ -309,11 +318,8 @@ innobase_check_index_keys(
continue;
}
- sql_print_error("InnoDB: column `%s`"
- " is not allowed to occur"
- " twice in index `%s`.\n",
- key_part1.field->field_name,
- key.name);
+ my_error(ER_WRONG_KEY_COLUMN, MYF(0),
+ key_part1.field->field_name);
return(ER_WRONG_KEY_COLUMN);
}
}
@@ -522,12 +528,14 @@ innobase_create_key_def(
key_info->name, "PRIMARY");
/* If there is a UNIQUE INDEX consisting entirely of NOT NULL
- columns, MySQL will treat it as a PRIMARY KEY unless the
- table already has one. */
+ columns and if the index does not contain column prefix(es)
+ (only prefix/part of the column is indexed), MySQL will treat the
+ index as a PRIMARY KEY unless the table already has one. */
if (!new_primary && (key_info->flags & HA_NOSAME)
+ && (!(key_info->flags & HA_KEY_HAS_PART_KEY_SEG))
&& row_table_got_default_clust_index(table)) {
- uint key_part = key_info->key_parts;
+ uint key_part = key_info->key_parts;
new_primary = TRUE;
@@ -656,12 +664,18 @@ ha_innobase::add_index(
innodb_table = indexed_table
= dict_table_get(prebuilt->table->name, FALSE);
+ if (UNIV_UNLIKELY(!innodb_table)) {
+ error = HA_ERR_NO_SUCH_TABLE;
+ goto err_exit;
+ }
+
/* Check if the index name is reserved. */
if (innobase_index_name_is_reserved(trx, key_info, num_of_keys)) {
error = ER_WRONG_NAME_FOR_INDEX;
} else {
/* Check that index keys are sensible */
- error = innobase_check_index_keys(key_info, num_of_keys);
+ error = innobase_check_index_keys(key_info, num_of_keys,
+ innodb_table);
}
if (UNIV_UNLIKELY(error)) {
@@ -708,6 +722,8 @@ err_exit:
row_mysql_lock_data_dictionary(trx);
dict_locked = TRUE;
+ ut_d(dict_table_check_for_dup_indexes(innodb_table, FALSE));
+
/* If a new primary key is defined for the table we need
to drop the original table and rebuild all indexes. */
@@ -740,6 +756,8 @@ err_exit:
user_thd);
}
+ ut_d(dict_table_check_for_dup_indexes(innodb_table,
+ FALSE));
row_mysql_unlock_data_dictionary(trx);
goto err_exit;
}
@@ -764,6 +782,10 @@ err_exit:
ut_ad(error == DB_SUCCESS);
+ /* We will need to rebuild index translation table. Set
+ valid index entry count in the translation table to zero */
+ share->idx_trans_tbl.index_count = 0;
+
/* Commit the data dictionary transaction in order to release
the table locks on the system tables. This means that if
MySQL crashes while creating a new primary key inside
@@ -799,18 +821,6 @@ err_exit:
index, num_of_idx, table);
error_handling:
-#ifdef UNIV_DEBUG
- /* TODO: At the moment we can't handle the following statement
- in our debugging code below:
-
- alter table t drop index b, add index (b);
-
- The fix will have to parse the SQL and note that the index
- being added has the same name as the one being dropped and
- ignore that in the dup index check.*/
- //dict_table_check_for_dup_indexes(prebuilt->table);
-#endif
-
/* After an error, remove all those index definitions from the
dictionary which were defined. */
@@ -822,6 +832,8 @@ error_handling:
row_mysql_lock_data_dictionary(trx);
dict_locked = TRUE;
+ ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE));
+
if (!new_primary) {
error = row_merge_rename_indexes(trx, indexed_table);
@@ -909,6 +921,7 @@ convert_error:
}
if (dict_locked) {
+ ut_d(dict_table_check_for_dup_indexes(innodb_table, FALSE));
row_mysql_unlock_data_dictionary(trx);
}
@@ -951,6 +964,7 @@ ha_innobase::prepare_drop_index(
/* Test and mark all the indexes to be dropped */
row_mysql_lock_data_dictionary(trx);
+ ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
/* Check that none of the indexes have previously been flagged
for deletion. */
@@ -1116,6 +1130,7 @@ func_exit:
} while (index);
}
+ ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
row_mysql_unlock_data_dictionary(trx);
DBUG_RETURN(err);
@@ -1162,6 +1177,7 @@ ha_innobase::final_drop_index(
prebuilt->table->flags, user_thd);
row_mysql_lock_data_dictionary(trx);
+ ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
if (UNIV_UNLIKELY(err)) {
@@ -1198,11 +1214,12 @@ ha_innobase::final_drop_index(
ut_a(!index->to_be_dropped);
}
-#ifdef UNIV_DEBUG
- dict_table_check_for_dup_indexes(prebuilt->table);
-#endif
+ /* We will need to rebuild index translation table. Set
+ valid index entry count in the translation table to zero */
+ share->idx_trans_tbl.index_count = 0;
func_exit:
+ ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
trx_commit_for_mysql(trx);
trx_commit_for_mysql(prebuilt->trx);
row_mysql_unlock_data_dictionary(trx);
diff --git a/storage/xtradb/handler/i_s.cc b/storage/xtradb/handler/i_s.cc
index fa76eec6249..c48676c2892 100644
--- a/storage/xtradb/handler/i_s.cc
+++ b/storage/xtradb/handler/i_s.cc
@@ -2900,6 +2900,9 @@ i_s_innodb_index_stats_fill(
i_s_table->field[3]->store(index->n_uniq);
row_per_keys[0] = '\0';
+
+ /* It is remained optimistic operation still for now */
+ //dict_index_stat_mutex_enter(index);
if (index->stat_n_diff_key_vals) {
for (i = 1; i <= index->n_uniq; i++) {
ib_int64_t rec_per_key;
@@ -2913,6 +2916,8 @@ i_s_innodb_index_stats_fill(
strncat(row_per_keys, buff, 256 - strlen(row_per_keys));
}
}
+ //dict_index_stat_mutex_exit(index);
+
field_store_string(i_s_table->field[4], row_per_keys);
i_s_table->field[5]->store(index->stat_index_size);
@@ -3168,6 +3173,14 @@ UNIV_INTERN struct st_mysql_plugin i_s_innodb_admin_command =
static ST_FIELD_INFO i_s_innodb_sys_tables_info[] =
{
+ {STRUCT_FLD(field_name, "SCHEMA"),
+ STRUCT_FLD(field_length, NAME_LEN),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
{STRUCT_FLD(field_name, "NAME"),
STRUCT_FLD(field_length, NAME_LEN),
STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
@@ -3326,6 +3339,54 @@ copy_string_field(
static
int
+copy_name_fields(
+/*=============*/
+ TABLE* table,
+ int table_field_1,
+ const rec_t* rec,
+ int rec_field)
+{
+ int status;
+ const byte* data;
+ ulint len;
+
+ data = rec_get_nth_field_old(rec, rec_field, &len);
+ if (len == UNIV_SQL_NULL) {
+ table->field[table_field_1]->set_null();
+ table->field[table_field_1 + 1]->set_null();
+ status = 0; /* success */
+ } else {
+ char buf[NAME_LEN * 2 + 2];
+ char* ptr;
+
+ if (len > NAME_LEN * 2 + 1) {
+ table->field[table_field_1]->set_null();
+ status = field_store_string(table->field[table_field_1 + 1],
+ "###TOO LONG NAME###");
+ goto end_func;
+ }
+
+ strncpy(buf, (char*)data, len);
+ buf[len] = '\0';
+ ptr = strchr(buf, '/');
+ if (ptr) {
+ *ptr = '\0';
+ ++ptr;
+
+ status = field_store_string(table->field[table_field_1], buf);
+ status |= field_store_string(table->field[table_field_1 + 1], ptr);
+ } else {
+ table->field[table_field_1]->set_null();
+ status = field_store_string(table->field[table_field_1 + 1], buf);
+ }
+ }
+
+end_func:
+ return status;
+}
+
+static
+int
copy_int_field(
/*===========*/
TABLE* table,
@@ -3394,49 +3455,49 @@ copy_sys_tables_rec(
/* NAME */
field = dict_index_get_nth_col_pos(index, 0);
- status = copy_string_field(table, 0, rec, field);
+ status = copy_name_fields(table, 0, rec, field);
if (status) {
return status;
}
/* ID */
field = dict_index_get_nth_col_pos(index, 1);
- status = copy_id_field(table, 1, rec, field);
+ status = copy_id_field(table, 2, rec, field);
if (status) {
return status;
}
/* N_COLS */
field = dict_index_get_nth_col_pos(index, 2);
- status = copy_int_field(table, 2, rec, field);
+ status = copy_int_field(table, 3, rec, field);
if (status) {
return status;
}
/* TYPE */
field = dict_index_get_nth_col_pos(index, 3);
- status = copy_int_field(table, 3, rec, field);
+ status = copy_int_field(table, 4, rec, field);
if (status) {
return status;
}
/* MIX_ID */
field = dict_index_get_nth_col_pos(index, 4);
- status = copy_id_field(table, 4, rec, field);
+ status = copy_id_field(table, 5, rec, field);
if (status) {
return status;
}
/* MIX_LEN */
field = dict_index_get_nth_col_pos(index, 5);
- status = copy_int_field(table, 5, rec, field);
+ status = copy_int_field(table, 6, rec, field);
if (status) {
return status;
}
/* CLUSTER_NAME */
field = dict_index_get_nth_col_pos(index, 6);
- status = copy_string_field(table, 6, rec, field);
+ status = copy_string_field(table, 7, rec, field);
if (status) {
return status;
}
/* SPACE */
field = dict_index_get_nth_col_pos(index, 7);
- status = copy_int_field(table, 7, rec, field);
+ status = copy_int_field(table, 8, rec, field);
if (status) {
return status;
}
diff --git a/storage/xtradb/handler/innodb_patch_info.h b/storage/xtradb/handler/innodb_patch_info.h
index 5fa50129a04..38b97411340 100644
--- a/storage/xtradb/handler/innodb_patch_info.h
+++ b/storage/xtradb/handler/innodb_patch_info.h
@@ -41,8 +41,8 @@ struct innodb_enhancement {
{"innodb_admin_command_base","XtraDB specific command interface through i_s","","http://www.percona.com/docs/wiki/percona-xtradb"},
{"innodb_show_lock_name","Show mutex/lock name instead of crated file/line","","http://www.percona.com/docs/wiki/percona-xtradb"},
{"innodb_extend_slow","Extended statistics in slow.log","It is InnoDB-part only. It needs to patch also to mysqld.","http://www.percona.com/docs/wiki/percona-xtradb"},
-{"innodb_relax_table_creation","Relax limitation of column size at table creation as builtin InnoDB.","","http://www.percona.com/docs/wiki/percona-xtradb"},
{"innodb_lru_dump_restore","Dump and restore command for content of buffer pool","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_separate_doublewrite","Add option 'innodb_doublewrite_file' to separate doublewrite dedicated tablespace","","http://www.percona.com/docs/wiki/percona-xtradb"},
{"innodb_pass_corrupt_table","Treat tables as corrupt instead of crash, when meet corrupt blocks","","http://www.percona.com/docs/wiki/percona-xtradb"},
{"innodb_fast_checksum","Using the checksum on 32bit-unit calculation","incompatible for unpatched ver.","http://www.percona.com/docs/wiki/percona-xtradb"},
{"innodb_files_extend","allow >4GB transaction log files, and can vary universal page size of datafiles","incompatible for unpatched ver.","http://www.percona.com/docs/wiki/percona-xtradb"},
diff --git a/storage/xtradb/ibuf/ibuf0ibuf.c b/storage/xtradb/ibuf/ibuf0ibuf.c
index 94f3751dd04..e01c2d6b800 100644
--- a/storage/xtradb/ibuf/ibuf0ibuf.c
+++ b/storage/xtradb/ibuf/ibuf0ibuf.c
@@ -733,24 +733,41 @@ page containing the descriptor bits for the file page; the bitmap page
is x-latched */
static
page_t*
-ibuf_bitmap_get_map_page(
-/*=====================*/
- ulint space, /*!< in: space id of the file page */
- ulint page_no,/*!< in: page number of the file page */
- ulint zip_size,/*!< in: compressed page size in bytes;
- 0 for uncompressed pages */
- mtr_t* mtr) /*!< in: mtr */
+ibuf_bitmap_get_map_page_func(
+/*==========================*/
+ ulint space, /*!< in: space id of the file page */
+ ulint page_no,/*!< in: page number of the file page */
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
{
buf_block_t* block;
- block = buf_page_get(space, zip_size,
- ibuf_bitmap_page_no_calc(zip_size, page_no),
- RW_X_LATCH, mtr);
+ block = buf_page_get_gen(space, zip_size,
+ ibuf_bitmap_page_no_calc(zip_size, page_no),
+ RW_X_LATCH, NULL, BUF_GET,
+ file, line, mtr);
buf_block_dbg_add_level(block, SYNC_IBUF_BITMAP);
return(buf_block_get_frame(block));
}
+/********************************************************************//**
+Gets the ibuf bitmap page where the bits describing a given file page are
+stored.
+@return bitmap page where the file page is mapped, that is, the bitmap
+page containing the descriptor bits for the file page; the bitmap page
+is x-latched
+@param space in: space id of the file page
+@param page_no in: page number of the file page
+@param zip_size in: compressed page size in bytes; 0 for uncompressed pages
+@param mtr in: mini-transaction */
+#define ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr) \
+ ibuf_bitmap_get_map_page_func(space, page_no, zip_size, \
+ __FILE__, __LINE__, mtr)
+
/************************************************************************//**
Sets the free bits of the page in the ibuf bitmap. This is done in a separate
mini-transaction, hence this operation does not restrict further work to only
diff --git a/storage/xtradb/include/btr0btr.h b/storage/xtradb/include/btr0btr.h
index d5c8258513c..5e6a76c7d21 100644
--- a/storage/xtradb/include/btr0btr.h
+++ b/storage/xtradb/include/btr0btr.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -193,6 +193,10 @@ btr_leaf_page_release(
mtr_t* mtr); /*!< in: mtr */
/**************************************************************//**
Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
@return child node address */
UNIV_INLINE
ulint
@@ -317,12 +321,16 @@ Inserts a data tuple to a tree on a non-leaf level. It is assumed
that mtr holds an x-latch on the tree. */
UNIV_INTERN
void
-btr_insert_on_non_leaf_level(
-/*=========================*/
+btr_insert_on_non_leaf_level_func(
+/*==============================*/
dict_index_t* index, /*!< in: index */
ulint level, /*!< in: level, must be > 0 */
dtuple_t* tuple, /*!< in: the record to be inserted */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
+# define btr_insert_on_non_leaf_level(i,l,t,m) \
+ btr_insert_on_non_leaf_level_func(i,l,t,__FILE__,__LINE__,m)
#endif /* !UNIV_HOTBACKUP */
/****************************************************************//**
Sets a record as the predefined minimum record. */
diff --git a/storage/xtradb/include/btr0btr.ic b/storage/xtradb/include/btr0btr.ic
index 4aa4a41f50d..c9c38f3c3b3 100644
--- a/storage/xtradb/include/btr0btr.ic
+++ b/storage/xtradb/include/btr0btr.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -257,6 +257,10 @@ btr_page_set_prev(
/**************************************************************//**
Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
@return child node address */
UNIV_INLINE
ulint
diff --git a/storage/xtradb/include/btr0cur.h b/storage/xtradb/include/btr0cur.h
index 480a3877e54..716f15c4267 100644
--- a/storage/xtradb/include/btr0cur.h
+++ b/storage/xtradb/include/btr0cur.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -152,29 +152,39 @@ btr_cur_search_to_nth_level(
ulint has_search_latch,/*!< in: latch mode the caller
currently has on btr_search_latch:
RW_S_LATCH, or 0 */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
/*****************************************************************//**
Opens a cursor at either end of an index. */
UNIV_INTERN
void
-btr_cur_open_at_index_side(
-/*=======================*/
+btr_cur_open_at_index_side_func(
+/*============================*/
ibool from_left, /*!< in: TRUE if open to the low end,
FALSE if to the high end */
dict_index_t* index, /*!< in: index */
ulint latch_mode, /*!< in: latch mode */
btr_cur_t* cursor, /*!< in: cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
+#define btr_cur_open_at_index_side(f,i,l,c,m) \
+ btr_cur_open_at_index_side_func(f,i,l,c,__FILE__,__LINE__,m)
/**********************************************************************//**
Positions a cursor at a randomly chosen position within a B-tree. */
UNIV_INTERN
void
-btr_cur_open_at_rnd_pos(
-/*====================*/
+btr_cur_open_at_rnd_pos_func(
+/*=========================*/
dict_index_t* index, /*!< in: index */
ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_cur_t* cursor, /*!< in/out: B-tree cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
+#define btr_cur_open_at_rnd_pos(i,l,c,m) \
+ btr_cur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
/*************************************************************//**
Tries to perform an insert to a page in an index tree, next to cursor.
It is assumed that mtr holds an x-latch on the page. The operation does
diff --git a/storage/xtradb/include/btr0pcur.h b/storage/xtradb/include/btr0pcur.h
index 12b1375d8b7..2334a266280 100644
--- a/storage/xtradb/include/btr0pcur.h
+++ b/storage/xtradb/include/btr0pcur.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -82,8 +82,8 @@ Initializes and opens a persistent cursor to an index tree. It should be
closed with btr_pcur_close. */
UNIV_INLINE
void
-btr_pcur_open(
-/*==========*/
+btr_pcur_open_func(
+/*===============*/
dict_index_t* index, /*!< in: index */
const dtuple_t* tuple, /*!< in: tuple on which search done */
ulint mode, /*!< in: PAGE_CUR_L, ...;
@@ -94,14 +94,18 @@ btr_pcur_open(
record! */
ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open(i,t,md,l,c,m) \
+ btr_pcur_open_func(i,t,md,l,c,__FILE__,__LINE__,m)
/**************************************************************//**
Opens an persistent cursor to an index tree without initializing the
cursor. */
UNIV_INLINE
void
-btr_pcur_open_with_no_init(
-/*=======================*/
+btr_pcur_open_with_no_init_func(
+/*============================*/
dict_index_t* index, /*!< in: index */
const dtuple_t* tuple, /*!< in: tuple on which search done */
ulint mode, /*!< in: PAGE_CUR_L, ...;
@@ -119,7 +123,12 @@ btr_pcur_open_with_no_init(
ulint has_search_latch,/*!< in: latch mode the caller
currently has on btr_search_latch:
RW_S_LATCH, or 0 */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open_with_no_init(ix,t,md,l,cur,has,m) \
+ btr_pcur_open_with_no_init_func(ix,t,md,l,cur,has,__FILE__,__LINE__,m)
+
/*****************************************************************//**
Opens a persistent cursor at either end of an index. */
UNIV_INLINE
@@ -160,8 +169,8 @@ before first in tree. The latching mode must be BTR_SEARCH_LEAF or
BTR_MODIFY_LEAF. */
UNIV_INTERN
void
-btr_pcur_open_on_user_rec(
-/*======================*/
+btr_pcur_open_on_user_rec_func(
+/*===========================*/
dict_index_t* index, /*!< in: index */
const dtuple_t* tuple, /*!< in: tuple on which search done */
ulint mode, /*!< in: PAGE_CUR_L, ... */
@@ -169,17 +178,25 @@ btr_pcur_open_on_user_rec(
BTR_MODIFY_LEAF */
btr_pcur_t* cursor, /*!< in: memory buffer for persistent
cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open_on_user_rec(i,t,md,l,c,m) \
+ btr_pcur_open_on_user_rec_func(i,t,md,l,c,__FILE__,__LINE__,m)
/**********************************************************************//**
Positions a cursor at a randomly chosen position within a B-tree. */
UNIV_INLINE
void
-btr_pcur_open_at_rnd_pos(
-/*=====================*/
+btr_pcur_open_at_rnd_pos_func(
+/*==========================*/
dict_index_t* index, /*!< in: index */
ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_pcur_t* cursor, /*!< in/out: B-tree pcur */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open_at_rnd_pos(i,l,c,m) \
+ btr_pcur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
/**************************************************************//**
Frees the possible old_rec_buf buffer of a persistent cursor and sets the
latch mode of the persistent cursor to BTR_NO_LATCHES. */
@@ -218,11 +235,15 @@ record and it can be restored on a user record whose ordering fields
are identical to the ones of the original user record */
UNIV_INTERN
ibool
-btr_pcur_restore_position(
-/*======================*/
+btr_pcur_restore_position_func(
+/*===========================*/
ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_pcur_t* cursor, /*!< in: detached persistent cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_restore_position(l,cur,mtr) \
+ btr_pcur_restore_position_func(l,cur,__FILE__,__LINE__,mtr)
/**************************************************************//**
If the latch mode of the cursor is BTR_LEAF_SEARCH or BTR_LEAF_MODIFY,
releases the page latch and bufferfix reserved by the cursor.
@@ -260,20 +281,13 @@ btr_pcur_get_mtr(
/*=============*/
btr_pcur_t* cursor); /*!< in: persistent cursor */
/**************************************************************//**
-Commits the pcur mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
that is, the cursor becomes detached. If there have been modifications
to the page where pcur is positioned, this can be used instead of
btr_pcur_release_leaf. Function btr_pcur_store_position should be used
before calling this, if restoration of cursor is wanted later. */
UNIV_INLINE
void
-btr_pcur_commit(
-/*============*/
- btr_pcur_t* pcur); /*!< in: persistent cursor */
-/**************************************************************//**
-Differs from btr_pcur_commit in that we can specify the mtr to commit. */
-UNIV_INLINE
-void
btr_pcur_commit_specify_mtr(
/*========================*/
btr_pcur_t* pcur, /*!< in: persistent cursor */
diff --git a/storage/xtradb/include/btr0pcur.ic b/storage/xtradb/include/btr0pcur.ic
index 0ca7223f861..0c38797e6c5 100644
--- a/storage/xtradb/include/btr0pcur.ic
+++ b/storage/xtradb/include/btr0pcur.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -395,30 +395,13 @@ btr_pcur_move_to_next(
}
/**************************************************************//**
-Commits the pcur mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
that is, the cursor becomes detached. If there have been modifications
to the page where pcur is positioned, this can be used instead of
btr_pcur_release_leaf. Function btr_pcur_store_position should be used
before calling this, if restoration of cursor is wanted later. */
UNIV_INLINE
void
-btr_pcur_commit(
-/*============*/
- btr_pcur_t* pcur) /*!< in: persistent cursor */
-{
- ut_a(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
-
- pcur->latch_mode = BTR_NO_LATCHES;
-
- mtr_commit(pcur->mtr);
-
- pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
-}
-
-/**************************************************************//**
-Differs from btr_pcur_commit in that we can specify the mtr to commit. */
-UNIV_INLINE
-void
btr_pcur_commit_specify_mtr(
/*========================*/
btr_pcur_t* pcur, /*!< in: persistent cursor */
@@ -483,8 +466,8 @@ Initializes and opens a persistent cursor to an index tree. It should be
closed with btr_pcur_close. */
UNIV_INLINE
void
-btr_pcur_open(
-/*==========*/
+btr_pcur_open_func(
+/*===============*/
dict_index_t* index, /*!< in: index */
const dtuple_t* tuple, /*!< in: tuple on which search done */
ulint mode, /*!< in: PAGE_CUR_L, ...;
@@ -495,6 +478,8 @@ btr_pcur_open(
record! */
ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
btr_cur_t* btr_cursor;
@@ -511,7 +496,7 @@ btr_pcur_open(
btr_cursor = btr_pcur_get_btr_cur(cursor);
btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
- btr_cursor, 0, mtr);
+ btr_cursor, 0, file, line, mtr);
cursor->pos_state = BTR_PCUR_IS_POSITIONED;
cursor->trx_if_known = NULL;
@@ -522,8 +507,8 @@ Opens an persistent cursor to an index tree without initializing the
cursor. */
UNIV_INLINE
void
-btr_pcur_open_with_no_init(
-/*=======================*/
+btr_pcur_open_with_no_init_func(
+/*============================*/
dict_index_t* index, /*!< in: index */
const dtuple_t* tuple, /*!< in: tuple on which search done */
ulint mode, /*!< in: PAGE_CUR_L, ...;
@@ -541,6 +526,8 @@ btr_pcur_open_with_no_init(
ulint has_search_latch,/*!< in: latch mode the caller
currently has on btr_search_latch:
RW_S_LATCH, or 0 */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
btr_cur_t* btr_cursor;
@@ -553,7 +540,8 @@ btr_pcur_open_with_no_init(
btr_cursor = btr_pcur_get_btr_cur(cursor);
btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
- btr_cursor, has_search_latch, mtr);
+ btr_cursor, has_search_latch,
+ file, line, mtr);
cursor->pos_state = BTR_PCUR_IS_POSITIONED;
cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
@@ -600,11 +588,13 @@ btr_pcur_open_at_index_side(
Positions a cursor at a randomly chosen position within a B-tree. */
UNIV_INLINE
void
-btr_pcur_open_at_rnd_pos(
-/*=====================*/
+btr_pcur_open_at_rnd_pos_func(
+/*==========================*/
dict_index_t* index, /*!< in: index */
ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_pcur_t* cursor, /*!< in/out: B-tree pcur */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
/* Initialize the cursor */
@@ -614,8 +604,9 @@ btr_pcur_open_at_rnd_pos(
btr_pcur_init(cursor);
- btr_cur_open_at_rnd_pos(index, latch_mode,
- btr_pcur_get_btr_cur(cursor), mtr);
+ btr_cur_open_at_rnd_pos_func(index, latch_mode,
+ btr_pcur_get_btr_cur(cursor),
+ file, line, mtr);
cursor->pos_state = BTR_PCUR_IS_POSITIONED;
cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
diff --git a/storage/xtradb/include/buf0buf.h b/storage/xtradb/include/buf0buf.h
index c95fcc00dd3..f93510be6d6 100644
--- a/storage/xtradb/include/buf0buf.h
+++ b/storage/xtradb/include/buf0buf.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -33,6 +33,7 @@ Created 11/5/1995 Heikki Tuuri
#include "hash0hash.h"
#include "ut0byte.h"
#include "page0types.h"
+#include "ut0rbt.h"
#ifndef UNIV_HOTBACKUP
#include "os0proc.h"
@@ -202,20 +203,14 @@ with care. */
#define buf_page_get_with_no_latch(SP, ZS, OF, MTR) buf_page_get_gen(\
SP, ZS, OF, RW_NO_LATCH, NULL,\
BUF_GET_NO_LATCH, __FILE__, __LINE__, MTR)
-/**************************************************************//**
-NOTE! The following macros should be used instead of
-buf_page_optimistic_get_func, to improve debugging. Only values RW_S_LATCH and
-RW_X_LATCH are allowed as LA! */
-#define buf_page_optimistic_get(LA, BL, MC, MTR) \
- buf_page_optimistic_get_func(LA, BL, MC, __FILE__, __LINE__, MTR)
/********************************************************************//**
This is the general function used to get optimistic access to a database
page.
@return TRUE if success */
UNIV_INTERN
ibool
-buf_page_optimistic_get_func(
-/*=========================*/
+buf_page_optimistic_get(
+/*====================*/
ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
buf_block_t* block, /*!< in: guessed block */
ib_uint64_t modify_clock,/*!< in: modify clock value if mode is
@@ -1205,15 +1200,21 @@ struct buf_block_struct{
rw_lock_t lock; /*!< read-write lock of the buffer
frame */
unsigned lock_hash_val:32;/*!< hashed value of the page address
- in the record lock hash table */
- unsigned check_index_page_at_flush:1;
+ in the record lock hash table;
+ protected by buf_block_t::lock
+ (or buf_block_t::mutex, buf_pool_mutex
+ in buf_page_get_gen(),
+ buf_page_init_for_read()
+ and buf_page_create()) */
+ ibool check_index_page_at_flush;
/*!< TRUE if we know that this is
an index page, and want the database
to check its consistency before flush;
note that there may be pages in the
buffer pool which are index pages,
but this flag is not set because
- we do not keep track of all pages */
+ we do not keep track of all pages;
+ NOT protected by any mutex */
/* @} */
/** @name Optimistic search field */
/* @{ */
@@ -1379,6 +1380,19 @@ struct buf_pool_struct{
/*!< this is in the set state
when there is no flush batch
of the given type running */
+ ib_rbt_t* flush_rbt; /* !< a red-black tree is used
+ exclusively during recovery to
+ speed up insertions in the
+ flush_list. This tree contains
+ blocks in order of
+ oldest_modification LSN and is
+ kept in sync with the
+ flush_list.
+ Each member of the tree MUST
+ also be on the flush_list.
+ This tree is relevant only in
+ recovery and is set to NULL
+ once the recovery is over. */
ulint freed_page_clock;/*!< a sequence number used
to count the number of buffer
blocks removed from the end of
diff --git a/storage/xtradb/include/buf0buf.ic b/storage/xtradb/include/buf0buf.ic
index c9809fbdcd8..93cc68e7fc9 100644
--- a/storage/xtradb/include/buf0buf.ic
+++ b/storage/xtradb/include/buf0buf.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -81,7 +81,7 @@ buf_page_peek_if_too_old(
unsigned access_time = buf_page_is_accessed(bpage);
if (access_time > 0
- && (ut_time_ms() - access_time)
+ && ((ib_uint32_t) (ut_time_ms() - access_time))
>= buf_LRU_old_threshold_ms) {
return(TRUE);
}
@@ -743,6 +743,12 @@ buf_block_get_lock_hash_val(
/*========================*/
const buf_block_t* block) /*!< in: block */
{
+ ut_ad(block);
+ ut_ad(buf_page_in_file(&block->page));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_EXCLUSIVE)
+ || rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
return(block->lock_hash_val);
}
@@ -967,7 +973,12 @@ buf_page_hash_get(
ut_a(buf_page_in_file(bpage));
ut_ad(bpage->in_page_hash);
ut_ad(!bpage->in_zip_hash);
+#if UNIV_WORD_SIZE == 4
+ /* On 32-bit systems, there is no padding in
+ buf_page_t. On other systems, Valgrind could complain
+ about uninitialized pad bytes. */
UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage);
+#endif
}
return(bpage);
diff --git a/storage/xtradb/include/buf0flu.h b/storage/xtradb/include/buf0flu.h
index cac4bf9fe4b..2f7108fda1b 100644
--- a/storage/xtradb/include/buf0flu.h
+++ b/storage/xtradb/include/buf0flu.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -40,6 +40,16 @@ buf_flush_remove(
/*=============*/
buf_page_t* bpage); /*!< in: pointer to the block in question */
/********************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage has already been
+copied to dpage. */
+UNIV_INTERN
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+ buf_page_t* bpage, /*!< in/out: control block being moved */
+ buf_page_t* dpage); /*!< in/out: destination block */
+/********************************************************************//**
Updates the flush system data structures when a write is completed. */
UNIV_INTERN
void
@@ -140,8 +150,8 @@ how much redo the workload is generating and at what rate. */
struct buf_flush_stat_struct
{
- ib_uint64_t redo; /**< amount of redo generated. */
- ulint n_flushed; /**< number of pages flushed. */
+ ib_uint64_t redo; /*!< amount of redo generated. */
+ ulint n_flushed; /*!< number of pages flushed. */
};
/** Statistics for selecting flush rate of dirty pages. */
@@ -176,6 +186,22 @@ buf_flush_validate(void);
/*====================*/
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+/******************************************************************//**
+Initialize the red-black tree to speed up insertions into the flush_list
+during recovery process. Should be called at the start of recovery
+process before any page has been read/written. */
+UNIV_INTERN
+void
+buf_flush_init_flush_rbt(void);
+/*==========================*/
+
+/******************************************************************//**
+Frees up the red-black tree. */
+UNIV_INTERN
+void
+buf_flush_free_flush_rbt(void);
+/*==========================*/
+
/** When buf_flush_free_margin is called, it tries to make this many blocks
available to replacement in the free list and at the end of the LRU list (to
make sure that a read-ahead batch can be read efficiently in a single
diff --git a/storage/xtradb/include/data0type.ic b/storage/xtradb/include/data0type.ic
index 240b4288f39..2bf67a941bd 100644
--- a/storage/xtradb/include/data0type.ic
+++ b/storage/xtradb/include/data0type.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -285,6 +285,10 @@ dtype_new_store_for_order_and_null_size(
#endif
ulint len;
+ ut_ad(type);
+ ut_ad(type->mtype >= DATA_VARCHAR);
+ ut_ad(type->mtype <= DATA_MYSQL);
+
buf[0] = (byte)(type->mtype & 0xFFUL);
if (type->prtype & DATA_BINARY_TYPE) {
diff --git a/storage/xtradb/include/dict0boot.h b/storage/xtradb/include/dict0boot.h
index 51d37ee98d1..1a13bd1503a 100644
--- a/storage/xtradb/include/dict0boot.h
+++ b/storage/xtradb/include/dict0boot.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -137,6 +137,7 @@ clustered index */
#define DICT_SYS_INDEXES_PAGE_NO_FIELD 8
#define DICT_SYS_INDEXES_SPACE_NO_FIELD 7
#define DICT_SYS_INDEXES_TYPE_FIELD 6
+#define DICT_SYS_INDEXES_NAME_FIELD 4
/* When a row id which is zero modulo this number (which must be a power of
two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is
diff --git a/storage/xtradb/include/dict0dict.h b/storage/xtradb/include/dict0dict.h
index 8272bfe2422..0879e91ab33 100644
--- a/storage/xtradb/include/dict0dict.h
+++ b/storage/xtradb/include/dict0dict.h
@@ -928,9 +928,10 @@ UNIV_INTERN
void
dict_table_check_for_dup_indexes(
/*=============================*/
- const dict_table_t* table); /*!< in: Check for dup indexes
+ const dict_table_t* table, /*!< in: Check for dup indexes
in this table */
-
+ ibool tmp_ok);/*!< in: TRUE=allow temporary
+ index names */
#endif /* UNIV_DEBUG */
/**********************************************************************//**
Builds a node pointer out of a physical record and a page number.
@@ -1060,6 +1061,22 @@ UNIV_INTERN
void
dict_mutex_exit_for_mysql(void);
/*===========================*/
+/**********************************************************************//**
+Lock the appropriate mutex to protect index->stat_n_diff_key_vals[].
+index->id is used to pick the right mutex and it should not change
+before dict_index_stat_mutex_exit() is called on this index. */
+UNIV_INTERN
+void
+dict_index_stat_mutex_enter(
+/*========================*/
+ const dict_index_t* index); /*!< in: index */
+/**********************************************************************//**
+Unlock the appropriate mutex that protects index->stat_n_diff_key_vals[]. */
+UNIV_INTERN
+void
+dict_index_stat_mutex_exit(
+/*=======================*/
+ const dict_index_t* index); /*!< in: index */
/********************************************************************//**
Checks if the database name in two table names is the same.
@return TRUE if same db name */
diff --git a/storage/xtradb/include/dict0mem.h b/storage/xtradb/include/dict0mem.h
index 603ecfda3f9..ee3107a3be1 100644
--- a/storage/xtradb/include/dict0mem.h
+++ b/storage/xtradb/include/dict0mem.h
@@ -80,21 +80,39 @@ combination of types */
/** File format */
/* @{ */
#define DICT_TF_FORMAT_SHIFT 5 /* file format */
-#define DICT_TF_FORMAT_MASK (127 << DICT_TF_FORMAT_SHIFT)
+#define DICT_TF_FORMAT_MASK \
+((~(~0 << (DICT_TF_BITS - DICT_TF_FORMAT_SHIFT))) << DICT_TF_FORMAT_SHIFT)
#define DICT_TF_FORMAT_51 0 /*!< InnoDB/MySQL up to 5.1 */
#define DICT_TF_FORMAT_ZIP 1 /*!< InnoDB plugin for 5.1:
compressed tables,
new BLOB treatment */
/** Maximum supported file format */
#define DICT_TF_FORMAT_MAX DICT_TF_FORMAT_ZIP
-
+/* @} */
#define DICT_TF_BITS 6 /*!< number of flag bits */
#if (1 << (DICT_TF_BITS - DICT_TF_FORMAT_SHIFT)) <= DICT_TF_FORMAT_MAX
# error "DICT_TF_BITS is insufficient for DICT_TF_FORMAT_MAX"
#endif
/* @} */
+
+/** @brief Additional table flags.
+
+These flags will be stored in SYS_TABLES.MIX_LEN. All unused flags
+will be written as 0. The column may contain garbage for tables
+created with old versions of InnoDB that only implemented
+ROW_FORMAT=REDUNDANT. */
+/* @{ */
+#define DICT_TF2_SHIFT DICT_TF_BITS
+ /*!< Shift value for
+ table->flags. */
+#define DICT_TF2_TEMPORARY 1 /*!< TRUE for tables from
+ CREATE TEMPORARY TABLE. */
+#define DICT_TF2_BITS (DICT_TF2_SHIFT + 1)
+ /*!< Total number of bits
+ in table->flags. */
/* @} */
+
/**********************************************************************//**
Creates a table memory object.
@return own: table object */
@@ -374,7 +392,7 @@ struct dict_table_struct{
unsigned space:32;
/*!< space where the clustered index of the
table is placed */
- unsigned flags:DICT_TF_BITS;/*!< DICT_TF_COMPACT, ... */
+ unsigned flags:DICT_TF2_BITS;/*!< DICT_TF_COMPACT, ... */
unsigned ibd_file_missing:1;
/*!< TRUE if this is in a single-table
tablespace and the .ibd file is missing; then
diff --git a/storage/xtradb/include/fil0fil.h b/storage/xtradb/include/fil0fil.h
index bacaafa1c72..163cacf2892 100644
--- a/storage/xtradb/include/fil0fil.h
+++ b/storage/xtradb/include/fil0fil.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -110,9 +110,10 @@ extern fil_addr_t fil_addr_null;
contents of this field is valid
for all uncompressed pages. */
#define FIL_PAGE_FILE_FLUSH_LSN 26 /*!< this is only defined for the
- first page in a data file: the file
- has been flushed to disk at least up
- to this lsn */
+ first page in a system tablespace
+ data file (ibdata*, not *.ibd):
+ the file has been flushed to disk
+ at least up to this lsn */
#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 /*!< starting from 4.1.x this
contains the space id of the page */
#define FIL_PAGE_DATA 38 /*!< start of the data on the page */
diff --git a/storage/xtradb/include/hash0hash.h b/storage/xtradb/include/hash0hash.h
index 977cb829f35..b17c21a45ef 100644
--- a/storage/xtradb/include/hash0hash.h
+++ b/storage/xtradb/include/hash0hash.h
@@ -434,11 +434,12 @@ struct hash_table_struct {
these heaps */
#endif /* !UNIV_HOTBACKUP */
mem_heap_t* heap;
+#ifdef UNIV_DEBUG
ulint magic_n;
+# define HASH_TABLE_MAGIC_N 76561114
+#endif /* UNIV_DEBUG */
};
-#define HASH_TABLE_MAGIC_N 76561114
-
#ifndef UNIV_NONINL
#include "hash0hash.ic"
#endif
diff --git a/storage/xtradb/include/hash0hash.ic b/storage/xtradb/include/hash0hash.ic
index 19da2d50701..0b437894e2e 100644
--- a/storage/xtradb/include/hash0hash.ic
+++ b/storage/xtradb/include/hash0hash.ic
@@ -35,6 +35,8 @@ hash_get_nth_cell(
hash_table_t* table, /*!< in: hash table */
ulint n) /*!< in: cell index */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
ut_ad(n < table->n_cells);
return(table->array + n);
@@ -48,6 +50,8 @@ hash_table_clear(
/*=============*/
hash_table_t* table) /*!< in/out: hash table */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
memset(table->array, 0x0,
table->n_cells * sizeof(*table->array));
}
@@ -61,6 +65,8 @@ hash_get_n_cells(
/*=============*/
hash_table_t* table) /*!< in: table */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
return(table->n_cells);
}
@@ -74,6 +80,8 @@ hash_calc_hash(
ulint fold, /*!< in: folded value */
hash_table_t* table) /*!< in: hash table */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
return(ut_hash_ulint(fold, table->n_cells));
}
@@ -88,6 +96,8 @@ hash_get_mutex_no(
hash_table_t* table, /*!< in: hash table */
ulint fold) /*!< in: fold */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
ut_ad(ut_is_2pow(table->n_mutexes));
return(ut_2pow_remainder(hash_calc_hash(fold, table),
table->n_mutexes));
@@ -103,6 +113,8 @@ hash_get_nth_heap(
hash_table_t* table, /*!< in: hash table */
ulint i) /*!< in: index of the heap */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
ut_ad(i < table->n_mutexes);
return(table->heaps[i]);
@@ -120,6 +132,9 @@ hash_get_heap(
{
ulint i;
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+
if (table->heap) {
return(table->heap);
}
@@ -139,6 +154,8 @@ hash_get_nth_mutex(
hash_table_t* table, /*!< in: hash table */
ulint i) /*!< in: index of the mutex */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
ut_ad(i < table->n_mutexes);
return(table->mutexes + i);
@@ -156,6 +173,9 @@ hash_get_mutex(
{
ulint i;
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+
i = hash_get_mutex_no(table, fold);
return(hash_get_nth_mutex(table, i));
diff --git a/storage/xtradb/include/lock0lock.h b/storage/xtradb/include/lock0lock.h
index 82e4c9bd976..89a6977b589 100644
--- a/storage/xtradb/include/lock0lock.h
+++ b/storage/xtradb/include/lock0lock.h
@@ -43,6 +43,7 @@ extern ibool lock_print_waits;
#endif /* UNIV_DEBUG */
/* Buffer for storing information about the most recent deadlock error */
extern FILE* lock_latest_err_file;
+extern ulint srv_n_lock_deadlock_count;
/*********************************************************************//**
Gets the size of a lock struct.
@@ -613,13 +614,16 @@ lock_rec_print(
FILE* file, /*!< in: file where to print */
const lock_t* lock); /*!< in: record type lock */
/*********************************************************************//**
-Prints info of locks for all transactions. */
+Prints info of locks for all transactions.
+@return FALSE if not able to obtain kernel mutex
+and exits without printing info */
UNIV_INTERN
-void
+ibool
lock_print_info_summary(
/*====================*/
- FILE* file); /*!< in: file where to print */
-/*********************************************************************//**
+ FILE* file, /*!< in: file where to print */
+ ibool nowait);/*!< in: whether to wait for the kernel mutex */
+/*************************************************************************
Prints info of locks for each transaction. */
UNIV_INTERN
void
diff --git a/storage/xtradb/include/log0log.h b/storage/xtradb/include/log0log.h
index 135aeb69e2d..8fce4ef96bc 100644
--- a/storage/xtradb/include/log0log.h
+++ b/storage/xtradb/include/log0log.h
@@ -1,23 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
-
-*****************************************************************************/
-/*****************************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2009, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -825,7 +808,17 @@ struct log_struct{
written to some log group; for this to
be advanced, it is enough that the
write i/o has been completed for all
- log groups */
+ log groups.
+ Note that since InnoDB currently
+ has only one log group therefore
+ this value is redundant. Also it
+ is possible that this value
+ falls behind the
+ flushed_to_disk_lsn transiently.
+ It is appropriate to use either
+ flushed_to_disk_lsn or
+ write_lsn which are always
+ up-to-date and accurate. */
ib_uint64_t write_lsn; /*!< end lsn for the current running
write */
ulint write_end_offset;/*!< the data in buffer has
diff --git a/storage/xtradb/include/log0log.ic b/storage/xtradb/include/log0log.ic
index 36d151a3064..139f4041a36 100644
--- a/storage/xtradb/include/log0log.ic
+++ b/storage/xtradb/include/log0log.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -314,12 +314,15 @@ log_reserve_and_write_fast(
ulint data_len;
#ifdef UNIV_LOG_LSN_DEBUG
/* length of the LSN pseudo-record */
- ulint lsn_len = 1
- + mach_get_compressed_size(log_sys->lsn >> 32)
- + mach_get_compressed_size(log_sys->lsn & 0xFFFFFFFFUL);
+ ulint lsn_len;
#endif /* UNIV_LOG_LSN_DEBUG */
mutex_enter(&log_sys->mutex);
+#ifdef UNIV_LOG_LSN_DEBUG
+ lsn_len = 1
+ + mach_get_compressed_size(log_sys->lsn >> 32)
+ + mach_get_compressed_size(log_sys->lsn & 0xFFFFFFFFUL);
+#endif /* UNIV_LOG_LSN_DEBUG */
data_len = len
#ifdef UNIV_LOG_LSN_DEBUG
diff --git a/storage/xtradb/include/log0recv.h b/storage/xtradb/include/log0recv.h
index ac6b19a3f6a..15065267250 100644
--- a/storage/xtradb/include/log0recv.h
+++ b/storage/xtradb/include/log0recv.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -176,6 +176,12 @@ UNIV_INTERN
void
recv_recovery_from_checkpoint_finish(void);
/*======================================*/
+/********************************************************//**
+Initiates the rollback of active transactions. */
+UNIV_INTERN
+void
+recv_recovery_rollback_active(void);
+/*===============================*/
/*******************************************************//**
Scans log from a buffer and stores new log data to the parsing buffer.
Parses and hashes the log records if new data found. Unless
@@ -258,12 +264,14 @@ void
recv_sys_init(
/*==========*/
ulint available_memory); /*!< in: available memory in bytes */
+#ifndef UNIV_HOTBACKUP
/********************************************************//**
Reset the state of the recovery system variables. */
UNIV_INTERN
void
recv_sys_var_init(void);
/*===================*/
+#endif /* !UNIV_HOTBACKUP */
/*******************************************************************//**
Empties the hash table of stored log records, applying them to appropriate
pages. */
@@ -360,8 +368,8 @@ typedef struct recv_addr_struct recv_addr_t;
struct recv_addr_struct{
enum recv_addr_state state;
/*!< recovery state of the page */
- ulint space; /*!< space id */
- ulint page_no;/*!< page number */
+ unsigned space:32;/*!< space id */
+ unsigned page_no:32;/*!< page number */
UT_LIST_BASE_NODE_T(recv_t)
rec_list;/*!< list of log records for this page */
hash_node_t addr_hash;/*!< hash node in the hash bucket chain */
diff --git a/storage/xtradb/include/mem0dbg.h b/storage/xtradb/include/mem0dbg.h
index a064af5c678..d81e1418b2b 100644
--- a/storage/xtradb/include/mem0dbg.h
+++ b/storage/xtradb/include/mem0dbg.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -28,6 +28,13 @@ Created 6/9/1994 Heikki Tuuri
check fields whose sizes are given below */
#ifdef UNIV_MEM_DEBUG
+# ifndef UNIV_HOTBACKUP
+/* The mutex which protects in the debug version the hash table
+containing the list of live memory heaps, and also the global
+variables in mem0dbg.c. */
+extern mutex_t mem_hash_mutex;
+# endif /* !UNIV_HOTBACKUP */
+
#define MEM_FIELD_HEADER_SIZE ut_calc_align(2 * sizeof(ulint),\
UNIV_MEM_ALIGNMENT)
#define MEM_FIELD_TRAILER_SIZE sizeof(ulint)
diff --git a/storage/xtradb/include/mem0dbg.ic b/storage/xtradb/include/mem0dbg.ic
index cb9245411dc..b0c8178a623 100644
--- a/storage/xtradb/include/mem0dbg.ic
+++ b/storage/xtradb/include/mem0dbg.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -25,9 +25,6 @@ Created 6/8/1994 Heikki Tuuri
*************************************************************************/
#ifdef UNIV_MEM_DEBUG
-# ifndef UNIV_HOTBACKUP
-extern mutex_t mem_hash_mutex;
-# endif /* !UNIV_HOTBACKUP */
extern ulint mem_current_allocated_memory;
/******************************************************************//**
diff --git a/storage/xtradb/include/mem0mem.h b/storage/xtradb/include/mem0mem.h
index 98f8748e529..ee28cf7b225 100644
--- a/storage/xtradb/include/mem0mem.h
+++ b/storage/xtradb/include/mem0mem.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -359,6 +359,9 @@ struct mem_block_info_struct {
to the heap is also the first block in this list,
though it also contains the base node of the list. */
ulint len; /*!< physical length of this block in bytes */
+ ulint total_size; /* physical length in bytes of all blocks
+ in the heap. This is defined only in the base
+ node and is set to ULINT_UNDEFINED in others. */
ulint type; /*!< type of heap: MEM_HEAP_DYNAMIC, or
MEM_HEAP_BUF possibly ORed to MEM_HEAP_BTR_SEARCH */
ulint free; /*!< offset in bytes of the first free position for
diff --git a/storage/xtradb/include/mem0mem.ic b/storage/xtradb/include/mem0mem.ic
index e7080d8c508..cbce2edc661 100644
--- a/storage/xtradb/include/mem0mem.ic
+++ b/storage/xtradb/include/mem0mem.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -579,18 +579,12 @@ mem_heap_get_size(
/*==============*/
mem_heap_t* heap) /*!< in: heap */
{
- mem_block_t* block;
ulint size = 0;
ut_ad(mem_heap_check(heap));
- block = heap;
-
- while (block != NULL) {
+ size = heap->total_size;
- size += mem_block_get_len(block);
- block = UT_LIST_GET_NEXT(list, block);
- }
#ifndef UNIV_HOTBACKUP
if (heap->free_block) {
size += UNIV_PAGE_SIZE;
diff --git a/storage/xtradb/include/mtr0log.ic b/storage/xtradb/include/mtr0log.ic
index 5c24c38b337..db017c7d16e 100644
--- a/storage/xtradb/include/mtr0log.ic
+++ b/storage/xtradb/include/mtr0log.ic
@@ -27,8 +27,8 @@ Created 12/7/1995 Heikki Tuuri
#include "ut0lst.h"
#include "buf0buf.h"
#include "fsp0types.h"
+#include "srv0srv.h"
#include "trx0sys.h"
-
/********************************************************//**
Opens a buffer to mlog. It must be closed with mlog_close.
@return buffer, NULL if log mode MTR_LOG_NONE */
@@ -201,7 +201,8 @@ mlog_write_initial_log_record_fast(
the doublewrite buffer is located in pages
FSP_EXTENT_SIZE, ..., 3 * FSP_EXTENT_SIZE - 1 in the
system tablespace */
- if (space == TRX_SYS_SPACE
+ if ((space == TRX_SYS_SPACE
+ || (srv_doublewrite_file && space == TRX_DOUBLEWRITE_SPACE))
&& offset >= FSP_EXTENT_SIZE && offset < 3 * FSP_EXTENT_SIZE) {
if (trx_doublewrite_buf_is_being_created) {
/* Do nothing: we only come to this branch in an
diff --git a/storage/xtradb/include/mtr0mtr.ic b/storage/xtradb/include/mtr0mtr.ic
index 310c7c4117f..18f8e87b3cf 100644
--- a/storage/xtradb/include/mtr0mtr.ic
+++ b/storage/xtradb/include/mtr0mtr.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -70,6 +70,7 @@ mtr_memo_push(
ut_ad(type <= MTR_MEMO_X_LOCK);
ut_ad(mtr);
ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
memo = &(mtr->memo);
@@ -92,6 +93,7 @@ mtr_set_savepoint(
ut_ad(mtr);
ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
memo = &(mtr->memo);
@@ -149,6 +151,7 @@ mtr_memo_contains(
ut_ad(mtr);
ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE || mtr->state == MTR_COMMITTING);
memo = &(mtr->memo);
diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h
index 3eff5216867..eeab8a2b5d9 100644
--- a/storage/xtradb/include/os0file.h
+++ b/storage/xtradb/include/os0file.h
@@ -1,23 +1,6 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
-
-*****************************************************************************/
/***********************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2009, Percona Inc.
Portions of this file contain modifications contributed and copyrighted
diff --git a/storage/xtradb/include/que0que.h b/storage/xtradb/include/que0que.h
index 420f34550e2..39f8d07af89 100644
--- a/storage/xtradb/include/que0que.h
+++ b/storage/xtradb/include/que0que.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -30,6 +30,7 @@ Created 5/27/1996 Heikki Tuuri
#include "data0data.h"
#include "dict0types.h"
#include "trx0trx.h"
+#include "trx0roll.h"
#include "srv0srv.h"
#include "usr0types.h"
#include "que0types.h"
@@ -215,6 +216,16 @@ trx_t*
thr_get_trx(
/*========*/
que_thr_t* thr); /*!< in: query thread */
+/*******************************************************************//**
+Determines if this thread is rolling back an incomplete transaction
+in crash recovery.
+@return TRUE if thr is rolling back an incomplete transaction in crash
+recovery */
+UNIV_INLINE
+ibool
+thr_is_recv(
+/*========*/
+ const que_thr_t* thr); /*!< in: query thread */
/***********************************************************************//**
Gets the type of a graph node. */
UNIV_INLINE
diff --git a/storage/xtradb/include/que0que.ic b/storage/xtradb/include/que0que.ic
index a1c0dc1e77a..bd936670e1e 100644
--- a/storage/xtradb/include/que0que.ic
+++ b/storage/xtradb/include/que0que.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -38,6 +38,20 @@ thr_get_trx(
return(thr->graph->trx);
}
+/*******************************************************************//**
+Determines if this thread is rolling back an incomplete transaction
+in crash recovery.
+@return TRUE if thr is rolling back an incomplete transaction in crash
+recovery */
+UNIV_INLINE
+ibool
+thr_is_recv(
+/*========*/
+ const que_thr_t* thr) /*!< in: query thread */
+{
+ return(trx_is_recv(thr->graph->trx));
+}
+
/***********************************************************************//**
Gets the first thr in a fork. */
UNIV_INLINE
diff --git a/storage/xtradb/include/row0mysql.h b/storage/xtradb/include/row0mysql.h
index b05241f00f8..d2a8734c61f 100644
--- a/storage/xtradb/include/row0mysql.h
+++ b/storage/xtradb/include/row0mysql.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2000, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2000, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -451,6 +451,12 @@ row_drop_table_for_mysql(
const char* name, /*!< in: table name */
trx_t* trx, /*!< in: transaction handle */
ibool drop_db);/*!< in: TRUE=dropping whole database */
+/*********************************************************************//**
+Drop all temporary tables during crash recovery. */
+UNIV_INTERN
+void
+row_mysql_drop_temp_tables(void);
+/*============================*/
/*********************************************************************//**
Discards the tablespace of a table which stored in an .ibd file. Discarding
@@ -494,14 +500,19 @@ row_rename_table_for_mysql(
trx_t* trx, /*!< in: transaction handle */
ibool commit); /*!< in: if TRUE then commit trx */
/*********************************************************************//**
-Checks a table for corruption.
-@return DB_ERROR or DB_SUCCESS */
+Checks that the index contains entries in an ascending order, unique
+constraint is not broken, and calculates the number of index entries
+in the read view of the current transaction.
+@return DB_SUCCESS if ok */
UNIV_INTERN
ulint
-row_check_table_for_mysql(
+row_check_index_for_mysql(
/*======================*/
- row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL
- handle */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct
+ in MySQL handle */
+ const dict_index_t* index, /*!< in: index */
+ ulint* n_rows); /*!< out: number of entries
+ seen in the consistent read */
/*********************************************************************//**
Determines if a table is a magic monitor table.
diff --git a/storage/xtradb/include/row0sel.h b/storage/xtradb/include/row0sel.h
index 01a5afaa23e..8544b9d08ba 100644
--- a/storage/xtradb/include/row0sel.h
+++ b/storage/xtradb/include/row0sel.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -105,17 +105,6 @@ row_fetch_print(
/*============*/
void* row, /*!< in: sel_node_t* */
void* user_arg); /*!< in: not used */
-/****************************************************************//**
-Callback function for fetch that stores an unsigned 4 byte integer to the
-location pointed. The column's type must be DATA_INT, DATA_UNSIGNED, length
-= 4.
-@return always returns NULL */
-UNIV_INTERN
-void*
-row_fetch_store_uint4(
-/*==================*/
- void* row, /*!< in: sel_node_t* */
- void* user_arg); /*!< in: data pointer */
/***********************************************************//**
Prints a row in a select result.
@return query thread to run next or NULL */
diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h
index 99ad3ad03d0..a463075f435 100644
--- a/storage/xtradb/include/srv0srv.h
+++ b/storage/xtradb/include/srv0srv.h
@@ -1,7 +1,8 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, 2009, Google Inc.
+Copyright (c) 2009, Percona Inc.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -9,6 +10,13 @@ briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
@@ -22,32 +30,6 @@ this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA
*****************************************************************************/
-/***********************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-Copyright (c) 2009, Percona Inc.
-
-Portions of this file contain modifications contributed and copyrighted
-by Percona Inc.. Those modifications are
-gratefully acknowledged and are described briefly in the InnoDB
-documentation. The contributions by Percona Inc. are incorporated with
-their permission, and subject to the conditions contained in the file
-COPYING.Percona.
-
-This program is free software; you can redistribute it and/or modify it
-under the terms of the GNU General Public License as published by the
-Free Software Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
-Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-***********************************************************************/
/**************************************************//**
@file include/srv0srv.h
@@ -133,9 +115,10 @@ extern char** srv_data_file_names;
extern ulint* srv_data_file_sizes;
extern ulint* srv_data_file_is_raw_partition;
+extern char* srv_doublewrite_file;
+
extern ibool srv_extra_undoslots;
-extern ibool srv_fast_recovery;
extern ibool srv_recovery_stats;
extern ulint srv_use_purge_thread;
@@ -247,7 +230,6 @@ extern ulong srv_read_ahead;
extern ulong srv_adaptive_checkpoint;
extern ulong srv_expand_import;
-extern ulint srv_relax_table_creation;
extern ulint srv_pass_corrupt_table;
extern ulong srv_extra_rsegments;
@@ -265,7 +247,8 @@ extern ibool srv_print_innodb_tablespace_monitor;
extern ibool srv_print_verbose_log;
extern ibool srv_print_innodb_table_monitor;
-extern ibool srv_lock_timeout_and_monitor_active;
+extern ibool srv_lock_timeout_active;
+extern ibool srv_monitor_active;
extern ibool srv_error_monitor_active;
extern ulong srv_n_spin_wait_rounds;
@@ -595,15 +578,23 @@ srv_release_mysql_thread_if_suspended(
MySQL OS thread */
/*********************************************************************//**
A thread which wakes up threads whose lock wait may have lasted too long.
-This also prints the info output by various InnoDB monitors.
@return a dummy parameter */
UNIV_INTERN
os_thread_ret_t
-srv_lock_timeout_and_monitor_thread(
-/*================================*/
+srv_lock_timeout_thread(
+/*====================*/
void* arg); /*!< in: a dummy parameter required by
os_thread_create */
/*********************************************************************//**
+A thread which prints the info output by various InnoDB monitors.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_monitor_thread(
+/*===============*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+/*************************************************************************
A thread which prints warnings about semaphore waits which have lasted
too long. These can be used to track bugs which cause hangs.
@return a dummy parameter */
@@ -614,12 +605,15 @@ srv_error_monitor_thread(
void* arg); /*!< in: a dummy parameter required by
os_thread_create */
/******************************************************************//**
-Outputs to a file the output of the InnoDB Monitor. */
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information printed
+due to failure to obtain necessary mutex */
UNIV_INTERN
-void
+ibool
srv_printf_innodb_monitor(
/*======================*/
FILE* file, /*!< in: output stream */
+ ibool nowait, /*!< in: whether to wait for kernel mutex */
ulint* trx_start, /*!< out: file position of the start of
the list of active transactions */
ulint* trx_end); /*!< out: file position of the end of
@@ -664,6 +658,7 @@ struct export_var_struct{
ulint innodb_buffer_pool_write_requests;/*!< srv_buf_pool_write_requests */
ulint innodb_buffer_pool_read_ahead; /*!< srv_read_ahead */
ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/
+ ulint innodb_deadlocks; /* ??? */
ulint innodb_dblwr_pages_written; /*!< srv_dblwr_pages_written */
ulint innodb_dblwr_writes; /*!< srv_dblwr_writes */
ibool innodb_have_atomic_builtins; /*!< HAVE_ATOMIC_BUILTINS */
diff --git a/storage/xtradb/include/sync0rw.h b/storage/xtradb/include/sync0rw.h
index 85fa014d77a..1fe517ab30a 100644
--- a/storage/xtradb/include/sync0rw.h
+++ b/storage/xtradb/include/sync0rw.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -429,8 +429,9 @@ ibool
rw_lock_own(
/*========*/
rw_lock_t* lock, /*!< in: rw-lock */
- ulint lock_type); /*!< in: lock type: RW_LOCK_SHARED,
+ ulint lock_type) /*!< in: lock type: RW_LOCK_SHARED,
RW_LOCK_EX */
+ __attribute__((warn_unused_result));
#endif /* UNIV_SYNC_DEBUG */
/******************************************************************//**
Checks if somebody has locked the rw-lock in the specified mode. */
diff --git a/storage/xtradb/include/sync0sync.h b/storage/xtradb/include/sync0sync.h
index c653e44b5bd..7e210ea82f1 100644
--- a/storage/xtradb/include/sync0sync.h
+++ b/storage/xtradb/include/sync0sync.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -206,7 +206,8 @@ UNIV_INTERN
ibool
mutex_own(
/*======*/
- const mutex_t* mutex); /*!< in: mutex */
+ const mutex_t* mutex) /*!< in: mutex */
+ __attribute__((warn_unused_result));
#endif /* UNIV_DEBUG */
#ifdef UNIV_SYNC_DEBUG
/******************************************************************//**
@@ -238,16 +239,27 @@ ibool
sync_thread_levels_empty(void);
/*==========================*/
/******************************************************************//**
-Checks that the level array for the current thread is empty.
-@return TRUE if empty except the exceptions specified below */
+Checks if the level array for the current thread contains a
+mutex or rw-latch at the specified level.
+@return a matching latch, or NULL if not found */
UNIV_INTERN
-ibool
-sync_thread_levels_empty_gen(
-/*=========================*/
+void*
+sync_thread_levels_contains(
+/*========================*/
+ ulint level); /*!< in: latching order level
+ (SYNC_DICT, ...)*/
+/******************************************************************//**
+Checks if the level array for the current thread is empty.
+@return a latch, or NULL if empty except the exceptions specified below */
+UNIV_INTERN
+void*
+sync_thread_levels_nonempty_gen(
+/*============================*/
ibool dict_mutex_allowed); /*!< in: TRUE if dictionary mutex is
allowed to be owned by the thread,
also purge_is_running mutex is
allowed */
+#define sync_thread_levels_empty_gen(d) (!sync_thread_levels_nonempty_gen(d))
/******************************************************************//**
Gets the debug information for a reserved mutex. */
UNIV_INTERN
diff --git a/storage/xtradb/include/trx0rseg.h b/storage/xtradb/include/trx0rseg.h
index 0d7dc60329f..303188f09f2 100644
--- a/storage/xtradb/include/trx0rseg.h
+++ b/storage/xtradb/include/trx0rseg.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
diff --git a/storage/xtradb/include/trx0sys.h b/storage/xtradb/include/trx0sys.h
index 13d93d5a77a..8b941cdd4e6 100644
--- a/storage/xtradb/include/trx0sys.h
+++ b/storage/xtradb/include/trx0sys.h
@@ -124,6 +124,22 @@ trx_sys_hdr_page(
/*=============*/
ulint space, /*!< in: space */
ulint page_no);/*!< in: page number */
+/***************************************************************//**
+Checks if a space is the system tablespaces.
+@return TRUE if system tablespace */
+UNIV_INLINE
+ibool
+trx_sys_sys_space(
+/*==============*/
+ ulint space); /*!< in: space */
+/***************************************************************//**
+Checks if a space is the doublewrite tablespace.
+@return TRUE if doublewrite tablespace */
+UNIV_INLINE
+ibool
+trx_sys_doublewrite_space(
+/*======================*/
+ ulint space); /*!< in: space */
/*****************************************************************//**
Creates and initializes the central memory structures for the transaction
system. This is called when the database is started. */
@@ -137,6 +153,13 @@ UNIV_INTERN
void
trx_sys_create(void);
/*================*/
+/*****************************************************************//**
+Creates and initializes the dummy transaction system page for tablespace. */
+UNIV_INTERN
+void
+trx_sys_dummy_create(
+/*=================*/
+ ulint space);
/*********************************************************************
Create extra rollback segments when create_new_db */
UNIV_INTERN
@@ -343,12 +366,14 @@ UNIV_INTERN
void
trx_sys_file_format_tag_init(void);
/*==============================*/
+#ifndef UNIV_HOTBACKUP
/*****************************************************************//**
Shutdown/Close the transaction system. */
UNIV_INTERN
void
trx_sys_close(void);
/*===============*/
+#endif /* !UNIV_HOTBACKUP */
/*****************************************************************//**
Get the name representation of the file format from its id.
@return pointer to the name */
@@ -444,6 +469,8 @@ trx_sys_file_format_id_to_name(
/* Space id and page no where the trx system file copy resides */
#define TRX_SYS_SPACE 0 /* the SYSTEM tablespace */
+#define TRX_DOUBLEWRITE_SPACE 1 /* the doublewrite buffer tablespace if used */
+#define TRX_SYS_SPACE_MAX 9 /* reserved max space id for system tablespaces */
#include "fsp0fsp.h"
#define TRX_SYS_PAGE_NO FSP_TRX_SYS_PAGE_NO
@@ -507,7 +534,6 @@ this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */
within that file */
#define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */
-#ifndef UNIV_HOTBACKUP
/** Doublewrite buffer */
/* @{ */
/** The offset of the doublewrite buffer header on the trx system header page */
@@ -559,6 +585,7 @@ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_NO. */
#define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE
/* @} */
+#ifndef UNIV_HOTBACKUP
/** File format tag */
/* @{ */
/** The offset of the file format tag on the trx system header page
diff --git a/storage/xtradb/include/trx0sys.ic b/storage/xtradb/include/trx0sys.ic
index 820d31d0692..c7b09d4aec2 100644
--- a/storage/xtradb/include/trx0sys.ic
+++ b/storage/xtradb/include/trx0sys.ic
@@ -71,6 +71,40 @@ trx_sys_hdr_page(
}
/***************************************************************//**
+Checks if a space is the system tablespaces.
+@return TRUE if system tablespace */
+UNIV_INLINE
+ibool
+trx_sys_sys_space(
+/*==============*/
+ ulint space) /*!< in: space */
+{
+ if (srv_doublewrite_file) {
+ /* several spaces are reserved */
+ return((ibool)(space <= TRX_SYS_SPACE_MAX));
+ } else {
+ return((ibool)(space == TRX_SYS_SPACE));
+ }
+}
+
+/***************************************************************//**
+Checks if a space is the doublewrite tablespace.
+@return TRUE if doublewrite tablespace */
+UNIV_INLINE
+ibool
+trx_sys_doublewrite_space(
+/*======================*/
+ ulint space) /*!< in: space */
+{
+ if (srv_doublewrite_file) {
+ /* doublewrite buffer is separated */
+ return((ibool)(space == TRX_DOUBLEWRITE_SPACE));
+ } else {
+ return((ibool)(space == TRX_SYS_SPACE));
+ }
+}
+
+/***************************************************************//**
Gets the pointer in the nth slot of the rseg array.
@return pointer to rseg object, NULL if slot not in use */
UNIV_INLINE
diff --git a/storage/xtradb/include/trx0trx.h b/storage/xtradb/include/trx0trx.h
index 3b845e498d0..4a4b54b93a0 100644
--- a/storage/xtradb/include/trx0trx.h
+++ b/storage/xtradb/include/trx0trx.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -338,9 +338,7 @@ trx_commit_step(
/**********************************************************************//**
Prints info about a transaction to the given file. The caller must own the
-kernel mutex and must have called
-innobase_mysql_prepare_print_arbitrary_thd(), unless he knows that MySQL
-or InnoDB cannot meanwhile change the info printed here. */
+kernel mutex. */
UNIV_INTERN
void
trx_print(
@@ -351,7 +349,7 @@ trx_print(
use the default max length */
/** Type of data dictionary operation */
-enum trx_dict_op {
+typedef enum trx_dict_op {
/** The transaction is not modifying the data dictionary. */
TRX_DICT_OP_NONE = 0,
/** The transaction is creating a table or an index, or
@@ -363,7 +361,7 @@ enum trx_dict_op {
existing table. In crash recovery, the data dictionary
must be locked, but the table must not be dropped. */
TRX_DICT_OP_INDEX = 2
-};
+} trx_dict_op_t;
/**********************************************************************//**
Determine if a transaction is a dictionary operation.
@@ -393,6 +391,14 @@ ibool
trx_is_interrupted(
/*===============*/
trx_t* trx); /*!< in: transaction */
+/**********************************************************************//**
+Determines if the currently running transaction is in strict mode.
+@return TRUE if strict */
+UNIV_INTERN
+ibool
+trx_is_strict(
+/*==========*/
+ trx_t* trx); /*!< in: transaction */
#else /* !UNIV_HOTBACKUP */
#define trx_is_interrupted(trx) FALSE
#endif /* !UNIV_HOTBACKUP */
@@ -465,69 +471,79 @@ rolling back after a database recovery */
struct trx_struct{
ulint magic_n;
- /* All the next fields are protected by the kernel mutex, except the
- undo logs which are protected by undo_mutex */
+
+ /* These fields are not protected by any mutex. */
const char* op_info; /*!< English text describing the
current operation, or an empty
string */
- unsigned is_purge:1; /*!< 0=user transaction, 1=purge */
- unsigned is_recovered:1; /*!< 0=normal transaction,
- 1=recovered, must be rolled back */
- unsigned conc_state:2; /*!< state of the trx from the point
+ ulint conc_state; /*!< state of the trx from the point
of view of concurrency control:
TRX_ACTIVE, TRX_COMMITTED_IN_MEMORY,
... */
- unsigned que_state:2; /*!< valid when conc_state == TRX_ACTIVE:
- TRX_QUE_RUNNING, TRX_QUE_LOCK_WAIT,
- ... */
- unsigned isolation_level:2;/* TRX_ISO_REPEATABLE_READ, ... */
- unsigned check_foreigns:1;/* normally TRUE, but if the user
+ ulint isolation_level;/* TRX_ISO_REPEATABLE_READ, ... */
+ ulint check_foreigns; /* normally TRUE, but if the user
wants to suppress foreign key checks,
(in table imports, for example) we
set this FALSE */
- unsigned check_unique_secondary:1;
+ ulint check_unique_secondary;
/* normally TRUE, but if the user
wants to speed up inserts by
suppressing unique key checks
for secondary indexes when we decide
if we can use the insert buffer for
them, we set this FALSE */
- unsigned support_xa:1; /*!< normally we do the XA two-phase
+ ulint support_xa; /*!< normally we do the XA two-phase
commit steps, but by setting this to
FALSE, one can save CPU time and about
150 bytes in the undo log size as then
we skip XA steps */
- unsigned flush_log_later:1;/* In 2PC, we hold the
+ ulint flush_log_later;/* In 2PC, we hold the
prepare_commit mutex across
both phases. In that case, we
defer flush of the logs to disk
until after we release the
mutex. */
- unsigned must_flush_log_later:1;/* this flag is set to TRUE in
+ ulint must_flush_log_later;/* this flag is set to TRUE in
trx_commit_off_kernel() if
flush_log_later was TRUE, and there
were modifications by the transaction;
in that case we must flush the log
in trx_commit_complete_for_mysql() */
- unsigned dict_operation:2;/**< @see enum trx_dict_op */
- unsigned duplicates:2; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
- unsigned active_trans:2; /*!< 1 - if a transaction in MySQL
+ ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
+ ulint active_trans; /*!< 1 - if a transaction in MySQL
is active. 2 - if prepare_commit_mutex
was taken */
- unsigned has_search_latch:1;
+ ulint has_search_latch;
/* TRUE if this trx has latched the
search system latch in S-mode */
- unsigned declared_to_be_inside_innodb:1;
+ ulint deadlock_mark; /*!< a mark field used in deadlock
+ checking algorithm. */
+ trx_dict_op_t dict_operation; /**< @see enum trx_dict_op */
+
+ /* Fields protected by the srv_conc_mutex. */
+ ulint declared_to_be_inside_innodb;
/* this is TRUE if we have declared
this transaction in
srv_conc_enter_innodb to be inside the
InnoDB engine */
- unsigned handling_signals:1;/* this is TRUE as long as the trx
- is handling signals */
- unsigned dict_operation_lock_mode:2;
- /* 0, RW_S_LATCH, or RW_X_LATCH:
+
+ /* Fields protected by dict_operation_lock. The very latch
+ it is used to track. */
+ ulint dict_operation_lock_mode;
+ /*!< 0, RW_S_LATCH, or RW_X_LATCH:
the latch mode trx currently holds
on dict_operation_lock */
+
+ /* All the next fields are protected by the kernel mutex, except the
+ undo logs which are protected by undo_mutex */
+ ulint is_purge; /*!< 0=user transaction, 1=purge */
+ ulint is_recovered; /*!< 0=normal transaction,
+ 1=recovered, must be rolled back */
+ ulint que_state; /*!< valid when conc_state
+ == TRX_ACTIVE: TRX_QUE_RUNNING,
+ TRX_QUE_LOCK_WAIT, ... */
+ ulint handling_signals;/* this is TRUE as long as the trx
+ is handling signals */
time_t start_time; /*!< time the trx object was created
or the state last time became
TRX_ACTIVE */
@@ -657,11 +673,6 @@ struct trx_struct{
wait_thrs; /*!< query threads belonging to this
trx that are in the QUE_THR_LOCK_WAIT
state */
- ulint deadlock_mark; /*!< a mark field used in deadlock
- checking algorithm. This must be
- in its own machine word, because
- it can be changed by other
- threads while holding kernel_mutex. */
/*------------------------------*/
mem_heap_t* lock_heap; /*!< memory heap for the locks of the
transaction */
diff --git a/storage/xtradb/include/trx0types.h b/storage/xtradb/include/trx0types.h
index 24cf57d53d5..40a7256cbfd 100644
--- a/storage/xtradb/include/trx0types.h
+++ b/storage/xtradb/include/trx0types.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -70,6 +70,13 @@ typedef struct trx_named_savept_struct trx_named_savept_t;
enum trx_rb_ctx {
RB_NONE = 0, /*!< no rollback */
RB_NORMAL, /*!< normal rollback */
+ RB_RECOVERY_PURGE_REC,
+ /*!< rolling back an incomplete transaction,
+ in crash recovery, rolling back an
+ INSERT that was performed by updating a
+ delete-marked record; if the delete-marked record
+ no longer exists in an active read view, it will
+ be purged */
RB_RECOVERY /*!< rolling back an incomplete transaction,
in crash recovery */
};
diff --git a/storage/xtradb/include/univ.i b/storage/xtradb/include/univ.i
index 09b28ce72dd..834428e652f 100644
--- a/storage/xtradb/include/univ.i
+++ b/storage/xtradb/include/univ.i
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Copyright (c) 2009, Sun Microsystems, Inc.
@@ -46,8 +46,8 @@ Created 1/20/1994 Heikki Tuuri
#define INNODB_VERSION_MAJOR 1
#define INNODB_VERSION_MINOR 0
-#define INNODB_VERSION_BUGFIX 6
-#define PERCONA_INNODB_VERSION 10
+#define INNODB_VERSION_BUGFIX 8
+#define PERCONA_INNODB_VERSION 11.2
/* The following is the InnoDB version as shown in
SELECT plugin_version FROM information_schema.plugins;
@@ -118,7 +118,7 @@ if we are compiling on Windows. */
/* Include <sys/stat.h> to get S_I... macros defined for os0file.c */
# include <sys/stat.h>
-# if !defined(__NETWARE__) && !defined(__WIN__)
+# if !defined(__NETWARE__) && !defined(__WIN__)
# include <sys/mman.h> /* mmap() for os0proc.c */
# endif
@@ -168,6 +168,9 @@ command. Not tested on Windows. */
#define UNIV_COMPILE_TEST_FUNCS
*/
+#ifdef HAVE_valgrind
+# define UNIV_DEBUG_VALGRIND
+#endif /* HAVE_valgrind */
#if 0
#define UNIV_DEBUG_VALGRIND /* Enable extra
Valgrind instrumentation */
@@ -205,6 +208,10 @@ operations (very slow); also UNIV_DEBUG must be defined */
adaptive hash index */
#define UNIV_SRV_PRINT_LATCH_WAITS /* enable diagnostic output
in sync0sync.c */
+#define UNIV_BTR_AVOID_COPY /* when splitting B-tree nodes,
+ do not move any records when
+ all the records would
+ be moved */
#define UNIV_BTR_PRINT /* enable functions for
printing B-trees */
#define UNIV_ZIP_DEBUG /* extensive consistency checks
@@ -232,11 +239,6 @@ by one. */
/* the above option prevents forcing of log to disk
at a buffer page write: it should be tested with this
option off; also some ibuf tests are suppressed */
-/*
-#define UNIV_BASIC_LOG_DEBUG
-*/
- /* the above option enables basic recovery debugging:
- new allocated file pages are reset */
/* Linkage specifier for non-static InnoDB symbols (variables and functions)
that are only referenced from within InnoDB, not from MySQL */
diff --git a/storage/xtradb/include/ut0rbt.h b/storage/xtradb/include/ut0rbt.h
new file mode 100644
index 00000000000..6fd050acfe7
--- /dev/null
+++ b/storage/xtradb/include/ut0rbt.h
@@ -0,0 +1,309 @@
+/*****************************************************************************
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0rbt.h
+Red-Black tree implementation.
+
+Created 2007-03-20 Sunny Bains
+************************************************************************/
+
+#ifndef INNOBASE_UT0RBT_H
+#define INNOBASE_UT0RBT_H
+
+#if !defined(IB_RBT_TESTING)
+#include "univ.i"
+#include "ut0mem.h"
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#define ut_malloc malloc
+#define ut_free free
+#define ulint unsigned long
+#define ut_a(c) assert(c)
+#define ut_error assert(0)
+#define ibool unsigned int
+#define TRUE 1
+#define FALSE 0
+#endif
+
+/* Red black tree typedefs */
+typedef struct ib_rbt_struct ib_rbt_t;
+typedef struct ib_rbt_node_struct ib_rbt_node_t;
+/* FIXME: Iterator is a better name than _bound_ */
+typedef struct ib_rbt_bound_struct ib_rbt_bound_t;
+typedef void (*ib_rbt_print_node)(const ib_rbt_node_t* node);
+typedef int (*ib_rbt_compare)(const void* p1, const void* p2);
+
+/* Red black tree color types */
+enum ib_rbt_color_enum {
+ IB_RBT_RED,
+ IB_RBT_BLACK
+};
+
+typedef enum ib_rbt_color_enum ib_rbt_color_t;
+
+/* Red black tree node */
+struct ib_rbt_node_struct {
+ ib_rbt_color_t color; /* color of this node */
+
+ ib_rbt_node_t* left; /* points left child */
+ ib_rbt_node_t* right; /* points right child */
+ ib_rbt_node_t* parent; /* points parent node */
+
+ char value[1]; /* Data value */
+};
+
+/* Red black tree instance.*/
+struct ib_rbt_struct {
+ ib_rbt_node_t* nil; /* Black colored node that is
+ used as a sentinel. This is
+ pre-allocated too.*/
+
+ ib_rbt_node_t* root; /* Root of the tree, this is
+ pre-allocated and the first
+ data node is the left child.*/
+
+ ulint n_nodes; /* Total number of data nodes */
+
+ ib_rbt_compare compare; /* Fn. to use for comparison */
+ ulint sizeof_value; /* Sizeof the item in bytes */
+};
+
+/* The result of searching for a key in the tree, this is useful for
+a speedy lookup and insert if key doesn't exist.*/
+struct ib_rbt_bound_struct {
+ const ib_rbt_node_t*
+ last; /* Last node visited */
+
+ int result; /* Result of comparing with
+ the last non-nil node that
+ was visited */
+};
+
+/* Size in elements (t is an rb tree instance) */
+#define rbt_size(t) (t->n_nodes)
+
+/* Check whether the rb tree is empty (t is an rb tree instance) */
+#define rbt_empty(t) (rbt_size(t) == 0)
+
+/* Get data value (t is the data type, n is an rb tree node instance) */
+#define rbt_value(t, n) ((t*) &n->value[0])
+
+/* Compare a key with the node value (t is tree, k is key, n is node)*/
+#define rbt_compare(t, k, n) (t->compare(k, n->value))
+
+/****************************************************************//**
+Free an instance of a red black tree */
+UNIV_INTERN
+void
+rbt_free(
+/*=====*/
+ ib_rbt_t* tree); /*!< in: rb tree to free */
+/****************************************************************//**
+Create an instance of a red black tree
+@return rb tree instance */
+UNIV_INTERN
+ib_rbt_t*
+rbt_create(
+/*=======*/
+ size_t sizeof_value, /*!< in: size in bytes */
+ ib_rbt_compare compare); /*!< in: comparator */
+/****************************************************************//**
+Delete a node from the red black tree, identified by key.
+@return TRUE if success FALSE if not found */
+UNIV_INTERN
+ibool
+rbt_delete(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key); /*!< in: key to delete */
+/****************************************************************//**
+Remove a node from the rb tree, the node is not free'd, that is the
+callers responsibility.
+@return the deleted node with the const. */
+UNIV_INTERN
+ib_rbt_node_t*
+rbt_remove_node(
+/*============*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t*
+ node); /*!< in: node to delete, this
+ is a fudge and declared const
+ because the caller has access
+ only to const nodes.*/
+/****************************************************************//**
+Find a matching node in the rb tree.
+@return node if found else return NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lookup(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree to search */
+ const void* key); /*!< in: key to lookup */
+/****************************************************************//**
+Generic insert of a value in the rb tree.
+@return inserted node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_insert(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key, /*!< in: key for ordering */
+ const void* value); /*!< in: data that will be
+ copied to the node.*/
+/****************************************************************//**
+Add a new node to the tree, useful for data that is pre-sorted.
+@return appended node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_add_node(
+/*=========*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: parent */
+ const void* value); /*!< in: this value is copied
+ to the node */
+/****************************************************************//**
+Return the left most data node in the tree
+@return left most node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_first(
+/*======*/
+ const ib_rbt_t* tree); /*!< in: rb tree */
+/****************************************************************//**
+Return the right most data node in the tree
+@return right most node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_last(
+/*=====*/
+ const ib_rbt_t* tree); /*!< in: rb tree */
+/****************************************************************//**
+Return the next node from current.
+@return successor node to current that is passed in. */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_next(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* /*!< in: current node */
+ current);
+/****************************************************************//**
+Return the prev node from current.
+@return precedessor node to current that is passed in */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_prev(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* /*!< in: current node */
+ current);
+/****************************************************************//**
+Find the node that has the lowest key that is >= key.
+@return node that satisfies the lower bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lower_bound(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key); /*!< in: key to search */
+/****************************************************************//**
+Find the node that has the greatest key that is <= key.
+@return node that satisifies the upper bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_upper_bound(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key); /*!< in: key to search */
+/****************************************************************//**
+Search for the key, a node will be retuned in parent.last, whether it
+was found or not. If not found then parent.last will contain the
+parent node for the possibly new key otherwise the matching node.
+@return result of last comparison */
+UNIV_INTERN
+int
+rbt_search(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key); /*!< in: key to search */
+/****************************************************************//**
+Search for the key, a node will be retuned in parent.last, whether it
+was found or not. If not found then parent.last will contain the
+parent node for the possibly new key otherwise the matching node.
+@return result of last comparison */
+UNIV_INTERN
+int
+rbt_search_cmp(
+/*===========*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key, /*!< in: key to search */
+ ib_rbt_compare compare); /*!< in: comparator */
+/****************************************************************//**
+Clear the tree, deletes (and free's) all the nodes. */
+UNIV_INTERN
+void
+rbt_clear(
+/*======*/
+ ib_rbt_t* tree); /*!< in: rb tree */
+/****************************************************************//**
+Merge the node from dst into src. Return the number of nodes merged.
+@return no. of recs merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq(
+/*===========*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ const ib_rbt_t* src); /*!< in: src rb tree */
+/****************************************************************//**
+Merge the node from dst into src. Return the number of nodes merged.
+Delete the nodes from src after copying node to dst. As a side effect
+the duplicates will be left untouched in the src, since we don't support
+duplicates (yet). NOTE: src and dst must be similar, the function doesn't
+check for this condition (yet).
+@return no. of recs merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq_destructive(
+/*=======================*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ ib_rbt_t* src); /*!< in: src rb tree */
+/****************************************************************//**
+Verify the integrity of the RB tree. For debugging. 0 failure else height
+of tree (in count of black nodes).
+@return TRUE if OK FALSE if tree invalid. */
+UNIV_INTERN
+ibool
+rbt_validate(
+/*=========*/
+ const ib_rbt_t* tree); /*!< in: tree to validate */
+/****************************************************************//**
+Iterate over the tree in depth first order. */
+UNIV_INTERN
+void
+rbt_print(
+/*======*/
+ const ib_rbt_t* tree, /*!< in: tree to traverse */
+ ib_rbt_print_node print); /*!< in: print function */
+
+#endif /* INNOBASE_UT0RBT_H */
diff --git a/storage/xtradb/include/ut0rnd.ic b/storage/xtradb/include/ut0rnd.ic
index 372b5b6d5b7..c2043660efd 100644
--- a/storage/xtradb/include/ut0rnd.ic
+++ b/storage/xtradb/include/ut0rnd.ic
@@ -152,6 +152,7 @@ ut_hash_ulint(
ulint key, /*!< in: value to be hashed */
ulint table_size) /*!< in: hash table size */
{
+ ut_ad(table_size);
key = key ^ UT_HASH_RANDOM_MASK2;
return(key % table_size);
diff --git a/storage/xtradb/lock/lock0lock.c b/storage/xtradb/lock/lock0lock.c
index 59394f13766..b103ee79578 100644
--- a/storage/xtradb/lock/lock0lock.c
+++ b/storage/xtradb/lock/lock0lock.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -376,6 +376,7 @@ UNIV_INTERN FILE* lock_latest_err_file;
/* Flags for recursive deadlock search */
#define LOCK_VICTIM_IS_START 1
#define LOCK_VICTIM_IS_OTHER 2
+#define LOCK_EXCEED_MAX_DEPTH 3
/********************************************************************//**
Checks if a lock request results in a deadlock.
@@ -394,24 +395,25 @@ Looks recursively for a deadlock.
deadlock and we chose 'start' as the victim, LOCK_VICTIM_IS_OTHER if a
deadlock was found and we chose some other trx as a victim: we must do
the search again in this last case because there may be another
-deadlock! */
+deadlock!
+LOCK_EXCEED_MAX_DEPTH if the lock search exceeds max steps or max depth. */
static
ulint
lock_deadlock_recursive(
/*====================*/
trx_t* start, /*!< in: recursion starting point */
trx_t* trx, /*!< in: a transaction waiting for a lock */
- lock_t* wait_lock, /*!< in: the lock trx is waiting to be granted */
+ lock_t* wait_lock, /*!< in: lock that is waiting to be granted */
ulint* cost, /*!< in/out: number of calculation steps thus
far: if this exceeds LOCK_MAX_N_STEPS_...
- we return LOCK_VICTIM_IS_START */
+ we return LOCK_EXCEED_MAX_DEPTH */
ulint depth); /*!< in: recursion depth: if this exceeds
LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK, we
- return LOCK_VICTIM_IS_START */
+ return LOCK_EXCEED_MAX_DEPTH */
/*********************************************************************//**
Gets the nth bit of a record lock.
-@return TRUE if bit set */
+@return TRUE if bit set also if i == ULINT_UNDEFINED return FALSE*/
UNIV_INLINE
ibool
lock_rec_get_nth_bit(
@@ -1222,7 +1224,7 @@ lock_rec_get_first_on_page(
/*********************************************************************//**
Gets the next explicit lock request on a record.
-@return next lock, NULL if none exists */
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
UNIV_INLINE
lock_t*
lock_rec_get_next(
@@ -2404,7 +2406,7 @@ lock_rec_inherit_to_gap(
if (!lock_rec_get_insert_intention(lock)
&& !((srv_locks_unsafe_for_binlog
|| lock->trx->isolation_level
- == TRX_ISO_READ_COMMITTED)
+ <= TRX_ISO_READ_COMMITTED)
&& lock_get_mode(lock) == LOCK_X)) {
lock_rec_add_to_queue(LOCK_REC | LOCK_GAP
@@ -3267,8 +3269,6 @@ lock_deadlock_occurs(
lock_t* lock, /*!< in: lock the transaction is requesting */
trx_t* trx) /*!< in: transaction */
{
- dict_table_t* table;
- dict_index_t* index;
trx_t* mark_trx;
ulint ret;
ulint cost = 0;
@@ -3290,31 +3290,51 @@ retry:
ret = lock_deadlock_recursive(trx, trx, lock, &cost, 0);
- if (ret == LOCK_VICTIM_IS_OTHER) {
+ switch (ret) {
+ case LOCK_VICTIM_IS_OTHER:
/* We chose some other trx as a victim: retry if there still
is a deadlock */
-
goto retry;
- }
- if (UNIV_UNLIKELY(ret == LOCK_VICTIM_IS_START)) {
- if (lock_get_type_low(lock) & LOCK_TABLE) {
- table = lock->un_member.tab_lock.table;
- index = NULL;
+ case LOCK_EXCEED_MAX_DEPTH:
+ /* If the lock search exceeds the max step
+ or the max depth, the current trx will be
+ the victim. Print its information. */
+ rewind(lock_latest_err_file);
+ ut_print_timestamp(lock_latest_err_file);
+
+ fputs("TOO DEEP OR LONG SEARCH IN THE LOCK TABLE"
+ " WAITS-FOR GRAPH, WE WILL ROLL BACK"
+ " FOLLOWING TRANSACTION \n",
+ lock_latest_err_file);
+
+ fputs("\n*** TRANSACTION:\n", lock_latest_err_file);
+ trx_print(lock_latest_err_file, trx, 3000);
+
+ fputs("*** WAITING FOR THIS LOCK TO BE GRANTED:\n",
+ lock_latest_err_file);
+
+ if (lock_get_type(lock) == LOCK_REC) {
+ lock_rec_print(lock_latest_err_file, lock);
} else {
- index = lock->index;
- table = index->table;
+ lock_table_print(lock_latest_err_file, lock);
}
+ break;
- lock_deadlock_found = TRUE;
-
+ case LOCK_VICTIM_IS_START:
+ srv_n_lock_deadlock_count++;
fputs("*** WE ROLL BACK TRANSACTION (2)\n",
lock_latest_err_file);
+ break;
- return(TRUE);
+ default:
+ /* No deadlock detected*/
+ return(FALSE);
}
- return(FALSE);
+ lock_deadlock_found = TRUE;
+
+ return(TRUE);
}
/********************************************************************//**
@@ -3323,25 +3343,26 @@ Looks recursively for a deadlock.
deadlock and we chose 'start' as the victim, LOCK_VICTIM_IS_OTHER if a
deadlock was found and we chose some other trx as a victim: we must do
the search again in this last case because there may be another
-deadlock! */
+deadlock!
+LOCK_EXCEED_MAX_DEPTH if the lock search exceeds max steps or max depth. */
static
ulint
lock_deadlock_recursive(
/*====================*/
trx_t* start, /*!< in: recursion starting point */
trx_t* trx, /*!< in: a transaction waiting for a lock */
- lock_t* wait_lock, /*!< in: the lock trx is waiting to be granted */
+ lock_t* wait_lock, /*!< in: lock that is waiting to be granted */
ulint* cost, /*!< in/out: number of calculation steps thus
far: if this exceeds LOCK_MAX_N_STEPS_...
- we return LOCK_VICTIM_IS_START */
+ we return LOCK_EXCEED_MAX_DEPTH */
ulint depth) /*!< in: recursion depth: if this exceeds
LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK, we
- return LOCK_VICTIM_IS_START */
+ return LOCK_EXCEED_MAX_DEPTH */
{
+ ulint ret;
lock_t* lock;
- ulint bit_no = ULINT_UNDEFINED;
trx_t* lock_trx;
- ulint ret;
+ ulint heap_no = ULINT_UNDEFINED;
ut_a(trx);
ut_a(start);
@@ -3357,27 +3378,44 @@ lock_deadlock_recursive(
*cost = *cost + 1;
- lock = wait_lock;
-
if (lock_get_type_low(wait_lock) == LOCK_REC) {
+ ulint space;
+ ulint page_no;
+
+ heap_no = lock_rec_find_set_bit(wait_lock);
+ ut_a(heap_no != ULINT_UNDEFINED);
+
+ space = wait_lock->un_member.rec_lock.space;
+ page_no = wait_lock->un_member.rec_lock.page_no;
+
+ lock = lock_rec_get_first_on_page_addr(space, page_no);
+
+ /* Position the iterator on the first matching record lock. */
+ while (lock != NULL
+ && lock != wait_lock
+ && !lock_rec_get_nth_bit(lock, heap_no)) {
+
+ lock = lock_rec_get_next_on_page(lock);
+ }
+
+ if (lock == wait_lock) {
+ lock = NULL;
+ }
- bit_no = lock_rec_find_set_bit(wait_lock);
+ ut_ad(lock == NULL || lock_rec_get_nth_bit(lock, heap_no));
- ut_a(bit_no != ULINT_UNDEFINED);
+ } else {
+ lock = wait_lock;
}
/* Look at the locks ahead of wait_lock in the lock queue */
for (;;) {
- if (lock_get_type_low(lock) & LOCK_TABLE) {
-
- lock = UT_LIST_GET_PREV(un_member.tab_lock.locks,
- lock);
- } else {
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
- ut_a(bit_no != ULINT_UNDEFINED);
+ /* Get previous table lock. */
+ if (heap_no == ULINT_UNDEFINED) {
- lock = (lock_t*) lock_rec_get_prev(lock, bit_no);
+ lock = UT_LIST_GET_PREV(
+ un_member.tab_lock.locks, lock);
}
if (lock == NULL) {
@@ -3395,7 +3433,7 @@ lock_deadlock_recursive(
lock_trx = lock->trx;
- if (lock_trx == start || too_far) {
+ if (lock_trx == start) {
/* We came back to the recursion starting
point: a deadlock detected; or we have
@@ -3442,19 +3480,10 @@ lock_deadlock_recursive(
}
#ifdef UNIV_DEBUG
if (lock_print_waits) {
- fputs("Deadlock detected"
- " or too long search\n",
+ fputs("Deadlock detected\n",
stderr);
}
#endif /* UNIV_DEBUG */
- if (too_far) {
-
- fputs("TOO DEEP OR LONG SEARCH"
- " IN THE LOCK TABLE"
- " WAITS-FOR GRAPH\n", ef);
-
- return(LOCK_VICTIM_IS_START);
- }
if (trx_weight_cmp(wait_lock->trx,
start) >= 0) {
@@ -3490,6 +3519,21 @@ lock_deadlock_recursive(
return(LOCK_VICTIM_IS_OTHER);
}
+ if (too_far) {
+
+#ifdef UNIV_DEBUG
+ if (lock_print_waits) {
+ fputs("Deadlock search exceeds"
+ " max steps or depth.\n",
+ stderr);
+ }
+#endif /* UNIV_DEBUG */
+ /* The information about transaction/lock
+ to be rolled back is available in the top
+ level. Do not print anything here. */
+ return(LOCK_EXCEED_MAX_DEPTH);
+ }
+
if (lock_trx->que_state == TRX_QUE_LOCK_WAIT) {
/* Another trx ahead has requested lock in an
@@ -3499,12 +3543,28 @@ lock_deadlock_recursive(
ret = lock_deadlock_recursive(
start, lock_trx,
lock_trx->wait_lock, cost, depth + 1);
+
if (ret != 0) {
return(ret);
}
}
}
+ /* Get the next record lock to check. */
+ if (heap_no != ULINT_UNDEFINED) {
+
+ ut_a(lock != NULL);
+
+ do {
+ lock = lock_rec_get_next_on_page(lock);
+ } while (lock != NULL
+ && lock != wait_lock
+ && !lock_rec_get_nth_bit(lock, heap_no));
+
+ if (lock == wait_lock) {
+ lock = NULL;
+ }
+ }
}/* end of the 'for (;;)'-loop */
}
@@ -3706,9 +3766,10 @@ lock_table_enqueue_waiting(
/*********************************************************************//**
Checks if other transactions have an incompatible mode lock request in
-the lock queue. */
+the lock queue.
+@return lock or NULL */
UNIV_INLINE
-ibool
+lock_t*
lock_table_other_has_incompatible(
/*==============================*/
trx_t* trx, /*!< in: transaction, or NULL if all
@@ -3730,13 +3791,13 @@ lock_table_other_has_incompatible(
&& (!lock_mode_compatible(lock_get_mode(lock), mode))
&& (wait || !(lock_get_wait(lock)))) {
- return(TRUE);
+ return(lock);
}
lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
}
- return(FALSE);
+ return(NULL);
}
/*********************************************************************//**
@@ -4260,31 +4321,34 @@ lock_rec_print(
putc('\n', file);
if ( srv_show_verbose_locks ) {
- block = buf_page_try_get(space, page_no, &mtr);
+ block = buf_page_try_get(space, page_no, &mtr);
+
+ for (i = 0; i < lock_rec_get_n_bits(lock); ++i) {
+
+ if (!lock_rec_get_nth_bit(lock, i)) {
+ continue;
+ }
+
+ fprintf(file, "Record lock, heap no %lu", (ulong) i);
+
if (block) {
- for (i = 0; i < lock_rec_get_n_bits(lock); i++) {
+ const rec_t* rec;
- if (lock_rec_get_nth_bit(lock, i)) {
+ rec = page_find_rec_with_heap_no(
+ buf_block_get_frame(block), i);
- const rec_t* rec
- = page_find_rec_with_heap_no(
- buf_block_get_frame(block), i);
- offsets = rec_get_offsets(
- rec, lock->index, offsets,
- ULINT_UNDEFINED, &heap);
+ offsets = rec_get_offsets(
+ rec, lock->index, offsets,
+ ULINT_UNDEFINED, &heap);
- fprintf(file, "Record lock, heap no %lu ",
- (ulong) i);
- rec_print_new(file, rec, offsets);
- putc('\n', file);
- }
- }
- } else {
- for (i = 0; i < lock_rec_get_n_bits(lock); i++) {
- fprintf(file, "Record lock, heap no %lu\n", (ulong) i);
- }
+ putc(' ', file);
+ rec_print_new(file, rec, offsets);
}
+
+ putc('\n', file);
+ }
}
+
mtr_commit(&mtr);
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
@@ -4329,14 +4393,26 @@ lock_get_n_rec_locks(void)
#endif /* PRINT_NUM_OF_LOCK_STRUCTS */
/*********************************************************************//**
-Prints info of locks for all transactions. */
+Prints info of locks for all transactions.
+@return FALSE if not able to obtain kernel mutex
+and exits without printing info */
UNIV_INTERN
-void
+ibool
lock_print_info_summary(
/*====================*/
- FILE* file) /*!< in: file where to print */
+ FILE* file, /*!< in: file where to print */
+ ibool nowait) /*!< in: whether to wait for the kernel mutex */
{
- lock_mutex_enter_kernel();
+ /* if nowait is FALSE, wait on the kernel mutex,
+ otherwise return immediately if fail to obtain the
+ mutex. */
+ if (!nowait) {
+ lock_mutex_enter_kernel();
+ } else if (mutex_enter_nowait(&kernel_mutex)) {
+ fputs("FAIL TO OBTAIN KERNEL MUTEX, "
+ "SKIP LOCK INFO PRINTING\n", file);
+ return(FALSE);
+ }
if (lock_deadlock_found) {
fputs("------------------------\n"
@@ -4368,6 +4444,7 @@ lock_print_info_summary(
"Total number of lock structs in row lock hash table %lu\n",
(ulong) lock_get_n_rec_locks());
#endif /* PRINT_NUM_OF_LOCK_STRUCTS */
+ return(TRUE);
}
/*********************************************************************//**
@@ -4648,6 +4725,7 @@ lock_rec_queue_validate(
ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
block, heap_no, impl_trx));
}
+#if 0
} else {
/* The kernel mutex may get released temporarily in the
@@ -4658,6 +4736,27 @@ lock_rec_queue_validate(
(fil_space_t::latch), the following check WILL break
latching order and may cause a deadlock of threads. */
+ /* NOTE: This is a bogus check that would fail in the
+ following case: Our transaction is updating a
+ row. After it has updated the clustered index record,
+ it goes to a secondary index record and finds someone
+ else holding an explicit S- or X-lock on that
+ secondary index record, presumably from a locking
+ read. Our transaction cannot update the secondary
+ index immediately, but places a waiting X-lock request
+ on the secondary index record. There is nothing
+ illegal in this. The assertion is simply too strong. */
+
+ /* From the locking point of view, each secondary
+ index is a separate table. A lock that is held on
+ secondary index rec does not give any rights to modify
+ or read the clustered index rec. Therefore, we can
+ think of the sec index as a separate 'table' from the
+ clust index 'table'. Conversely, a transaction that
+ has acquired a lock on and modified a clustered index
+ record may need to wait for a lock on the
+ corresponding record in a secondary index. */
+
impl_trx = lock_sec_rec_some_has_impl_off_kernel(
rec, index, offsets);
@@ -4668,6 +4767,7 @@ lock_rec_queue_validate(
ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
block, heap_no, impl_trx));
}
+#endif
}
lock = lock_rec_get_first(block, heap_no);
@@ -4765,6 +4865,13 @@ loop:
|| lock->trx->conc_state == TRX_PREPARED
|| lock->trx->conc_state == TRX_COMMITTED_IN_MEMORY);
+# ifdef UNIV_SYNC_DEBUG
+ /* Only validate the record queues when this thread is not
+ holding a space->latch. Deadlocks are possible due to
+ latching order violation when UNIV_DEBUG is defined while
+ UNIV_SYNC_DEBUG is not. */
+ if (!sync_thread_levels_contains(SYNC_FSP))
+# endif /* UNIV_SYNC_DEBUG */
for (i = nth_bit; i < lock_rec_get_n_bits(lock); i++) {
if (i == 1 || lock_rec_get_nth_bit(lock, i)) {
@@ -4930,7 +5037,7 @@ lock_rec_insert_check_and_lock(
}
trx = thr_get_trx(thr);
- next_rec = page_rec_get_next((rec_t*) rec);
+ next_rec = page_rec_get_next_const(rec);
next_rec_heap_no = page_rec_get_heap_no(next_rec);
lock_mutex_enter_kernel();
diff --git a/storage/xtradb/log/log0log.c b/storage/xtradb/log/log0log.c
index f2487407071..03d097d1c12 100644
--- a/storage/xtradb/log/log0log.c
+++ b/storage/xtradb/log/log0log.c
@@ -1,23 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
-
-*****************************************************************************/
-/*****************************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2009, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -2045,7 +2028,7 @@ log_checkpoint(
return(TRUE);
}
- ut_ad(log_sys->written_to_all_lsn >= oldest_lsn);
+ ut_ad(log_sys->flushed_to_disk_lsn >= oldest_lsn);
if (log_sys->n_pending_checkpoint_writes > 0) {
/* A checkpoint write is running */
@@ -3127,7 +3110,7 @@ loop:
if (srv_fast_shutdown < 2
&& (srv_error_monitor_active
- || srv_lock_timeout_and_monitor_active)) {
+ || srv_lock_timeout_active || srv_monitor_active)) {
mutex_exit(&kernel_mutex);
diff --git a/storage/xtradb/log/log0recv.c b/storage/xtradb/log/log0recv.c
index 25eee07067c..1c9b4960ee4 100644
--- a/storage/xtradb/log/log0recv.c
+++ b/storage/xtradb/log/log0recv.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -138,7 +138,9 @@ UNIV_INTERN ulint recv_max_parsed_page_no;
/** This many frames must be left free in the buffer pool when we scan
the log and store the scanned log records in the buffer pool: we will
use these free frames to read in pages when we start applying the
-log records to the database. */
+log records to the database.
+This is the default value. If the actual size of the buffer pool is
+larger than 10 MB we'll set this value to 512. */
UNIV_INTERN ulint recv_n_pool_free_frames;
/** The maximum lsn we see for a page during the recovery process. If this
@@ -242,6 +244,7 @@ recv_sys_mem_free(void)
}
}
+#ifndef UNIV_HOTBACKUP
/************************************************************
Reset the state of the recovery system variables. */
UNIV_INTERN
@@ -251,7 +254,7 @@ recv_sys_var_init(void)
{
recv_lsn_checks_on = FALSE;
- recv_n_pool_free_frames = 1024;
+ recv_n_pool_free_frames = 256;
recv_recovery_on = FALSE;
@@ -277,10 +280,11 @@ recv_sys_var_init(void)
recv_max_parsed_page_no = 0;
- recv_n_pool_free_frames = 1024;
+ recv_n_pool_free_frames = 256;
recv_max_page_lsn = 0;
}
+#endif /* !UNIV_HOTBACKUP */
/************************************************************
Inits the recovery system for a recovery operation. */
@@ -295,20 +299,37 @@ recv_sys_init(
return;
}
+ /* Initialize red-black tree for fast insertions into the
+ flush_list during recovery process.
+ As this initialization is done while holding the buffer pool
+ mutex we perform it before acquiring recv_sys->mutex. */
+#ifndef UNIV_HOTBACKUP
+ buf_flush_init_flush_rbt();
+
mutex_enter(&(recv_sys->mutex));
-#ifndef UNIV_HOTBACKUP
recv_sys->heap = mem_heap_create_in_buffer(256);
#else /* !UNIV_HOTBACKUP */
recv_sys->heap = mem_heap_create(256);
recv_is_from_backup = TRUE;
#endif /* !UNIV_HOTBACKUP */
+ /* Set appropriate value of recv_n_pool_free_frames. */
+ if (buf_pool_get_curr_size() >= (10 * 1024 * 1024)) {
+ /* Buffer pool of size greater than 10 MB. */
+ recv_n_pool_free_frames = 512;
+ }
+
+ if (buf_pool_get_curr_size() >= (32 * 1024 * 1024)) {
+ /* Buffer pool of size greater than 32 MB. */
+ recv_n_pool_free_frames = 1024;
+ }
+
recv_sys->buf = ut_malloc(RECV_PARSING_BUF_SIZE);
recv_sys->len = 0;
recv_sys->recovered_offset = 0;
- recv_sys->addr_hash = hash_create(available_memory / 64);
+ recv_sys->addr_hash = hash_create(available_memory / 512);
recv_sys->n_addrs = 0;
recv_sys->apply_log_recs = FALSE;
@@ -348,7 +369,7 @@ recv_sys_empty_hash(void)
hash_table_free(recv_sys->addr_hash);
mem_heap_empty(recv_sys->heap);
- recv_sys->addr_hash = hash_create(buf_pool_get_curr_size() / 256);
+ recv_sys->addr_hash = hash_create(buf_pool_get_curr_size() / 512);
}
#ifndef UNIV_HOTBACKUP
@@ -373,6 +394,9 @@ recv_sys_debug_free(void)
recv_sys->last_block_buf_start = NULL;
mutex_exit(&(recv_sys->mutex));
+
+ /* Free up the flush_rbt. */
+ buf_flush_free_flush_rbt();
}
# endif /* UNIV_LOG_DEBUG */
@@ -2117,15 +2141,6 @@ recv_parse_log_rec(
}
#endif /* UNIV_LOG_LSN_DEBUG */
- /* Check that page_no is sensible */
-
- if (UNIV_UNLIKELY(*page_no > 0x8FFFFFFFUL)) {
-
- recv_sys->found_corrupt_log = TRUE;
-
- return(0);
- }
-
new_ptr = recv_parse_or_apply_log_rec_body(*type, new_ptr, end_ptr,
NULL, NULL);
if (UNIV_UNLIKELY(new_ptr == NULL)) {
@@ -2234,6 +2249,14 @@ recv_report_corrupt_log(
putc('\n', stderr);
}
+#ifndef UNIV_HOTBACKUP
+ if (!srv_force_recovery) {
+ fputs("InnoDB: Set innodb_force_recovery"
+ " to ignore this error.\n", stderr);
+ ut_error;
+ }
+#endif /* !UNIV_HOTBACKUP */
+
fputs("InnoDB: WARNING: the log file may have been corrupt and it\n"
"InnoDB: is possible that the log scan did not proceed\n"
"InnoDB: far enough in recovery! Please run CHECK TABLE\n"
@@ -2623,7 +2646,7 @@ recv_scan_log_recs(
ut_ad(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
ut_ad(len % OS_FILE_LOG_BLOCK_SIZE == 0);
- ut_ad(len > 0);
+ ut_ad(len >= OS_FILE_LOG_BLOCK_SIZE);
ut_a(store_to_hash <= TRUE);
finished = FALSE;
@@ -2748,6 +2771,16 @@ recv_scan_log_recs(
recv_sys->found_corrupt_log = TRUE;
+#ifndef UNIV_HOTBACKUP
+ if (!srv_force_recovery) {
+ fputs("InnoDB: Set"
+ " innodb_force_recovery"
+ " to ignore this error.\n",
+ stderr);
+ ut_error;
+ }
+#endif /* !UNIV_HOTBACKUP */
+
} else if (!recv_sys->found_corrupt_log) {
more_data = recv_sys_add_to_parsing_buf(
log_block, scanned_lsn);
@@ -3277,8 +3310,6 @@ void
recv_recovery_from_checkpoint_finish(void)
/*======================================*/
{
- int i;
-
/* Apply the hashed log records to the respective file pages */
if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) {
@@ -3403,9 +3434,16 @@ recv_recovery_from_checkpoint_finish(void)
The data dictionary latch should guarantee that there is at
most one data dictionary transaction active at a time. */
trx_rollback_or_clean_recovered(FALSE);
+}
- /* Drop partially created indexes. */
- row_merge_drop_temp_indexes();
+/********************************************************//**
+Initiates the rollback of active transactions. */
+UNIV_INTERN
+void
+recv_recovery_rollback_active(void)
+/*===============================*/
+{
+ int i;
#ifdef UNIV_SYNC_DEBUG
/* Wait for a while so that created threads have time to suspend
@@ -3415,6 +3453,11 @@ recv_recovery_from_checkpoint_finish(void)
/* Switch latching order checks on in sync0sync.c */
sync_order_checks_on = TRUE;
#endif
+ /* Drop partially created indexes. */
+ row_merge_drop_temp_indexes();
+ /* Drop temporary tables. */
+ row_mysql_drop_temp_tables();
+
if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) {
/* Rollback the uncommitted transactions which have no user
session */
diff --git a/storage/xtradb/mem/mem0dbg.c b/storage/xtradb/mem/mem0dbg.c
index 01eda20ec45..1cd2ff15bab 100644
--- a/storage/xtradb/mem/mem0dbg.c
+++ b/storage/xtradb/mem/mem0dbg.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -180,6 +180,10 @@ mem_close(void)
{
mem_pool_free(mem_comm_pool);
mem_comm_pool = NULL;
+#ifdef UNIV_MEM_DEBUG
+ mutex_free(&mem_hash_mutex);
+ mem_hash_initialized = FALSE;
+#endif /* UNIV_MEM_DEBUG */
}
#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/mem/mem0mem.c b/storage/xtradb/mem/mem0mem.c
index ccb2fd8a7b4..c0ce8a3e1ac 100644
--- a/storage/xtradb/mem/mem0mem.c
+++ b/storage/xtradb/mem/mem0mem.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -383,6 +383,20 @@ mem_heap_create_block(
mem_block_set_free(block, MEM_BLOCK_HEADER_SIZE);
mem_block_set_start(block, MEM_BLOCK_HEADER_SIZE);
+ if (UNIV_UNLIKELY(heap == NULL)) {
+ /* This is the first block of the heap. The field
+ total_size should be initialized here */
+ block->total_size = len;
+ } else {
+ /* Not the first allocation for the heap. This block's
+ total_length field should be set to undefined. */
+ ut_d(block->total_size = ULINT_UNDEFINED);
+ UNIV_MEM_INVALID(&block->total_size,
+ sizeof block->total_size);
+
+ heap->total_size += len;
+ }
+
ut_ad((ulint)MEM_BLOCK_HEADER_SIZE < len);
return(block);
@@ -471,6 +485,10 @@ mem_heap_block_free(
mem_pool_mutex_exit();
#endif
+
+ ut_ad(heap->total_size >= block->len);
+ heap->total_size -= block->len;
+
type = heap->type;
len = block->len;
block->magic_n = MEM_FREED_BLOCK_MAGIC_N;
diff --git a/storage/xtradb/os/os0file.c b/storage/xtradb/os/os0file.c
index d3b3edea29f..c79a41626c9 100644
--- a/storage/xtradb/os/os0file.c
+++ b/storage/xtradb/os/os0file.c
@@ -1,23 +1,6 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
-
-*****************************************************************************/
/***********************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2009, Percona Inc.
Portions of this file contain modifications contributed and copyrighted
@@ -846,7 +829,15 @@ next_file:
#ifdef HAVE_READDIR_R
ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent);
- if (ret != 0) {
+ if (ret != 0
+#ifdef UNIV_AIX
+ /* On AIX, only if we got non-NULL 'ent' (result) value and
+ a non-zero 'ret' (return) value, it indicates a failed
+ readdir_r() call. An NULL 'ent' with an non-zero 'ret'
+ would indicate the "end of the directory" is reached. */
+ && ent != NULL
+#endif
+ ) {
fprintf(stderr,
"InnoDB: cannot read directory %s, error %lu\n",
dirname, (ulong)ret);
@@ -4030,6 +4021,9 @@ os_aio_simulated_handle(
ulint i;
time_t now;
+ /* Fix compiler warning */
+ *consecutive_ios = NULL;
+
segment = os_aio_get_array_and_local_segment(&array, global_segment);
restart:
diff --git a/storage/xtradb/page/page0page.c b/storage/xtradb/page/page0page.c
index ab2ba60570e..10008f9ac25 100644
--- a/storage/xtradb/page/page0page.c
+++ b/storage/xtradb/page/page0page.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -658,6 +658,14 @@ page_copy_rec_list_end(
index, mtr);
}
+ /* Update PAGE_MAX_TRX_ID on the uncompressed page.
+ Modifications will be redo logged and copied to the compressed
+ page in page_zip_compress() or page_zip_reorganize() below. */
+ if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
+ page_update_max_trx_id(new_block, NULL,
+ page_get_max_trx_id(page), mtr);
+ }
+
if (UNIV_LIKELY_NULL(new_page_zip)) {
mtr_set_log_mode(mtr, log_mode);
@@ -696,15 +704,10 @@ page_copy_rec_list_end(
}
}
- /* Update the lock table, MAX_TRX_ID, and possible hash index */
+ /* Update the lock table and possible hash index */
lock_move_rec_list_end(new_block, block, rec);
- if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
- page_update_max_trx_id(new_block, new_page_zip,
- page_get_max_trx_id(page), mtr);
- }
-
btr_search_move_or_delete_hash_entries(new_block, block, index);
return(ret);
@@ -772,6 +775,16 @@ page_copy_rec_list_start(
mem_heap_free(heap);
}
+ /* Update PAGE_MAX_TRX_ID on the uncompressed page.
+ Modifications will be redo logged and copied to the compressed
+ page in page_zip_compress() or page_zip_reorganize() below. */
+ if (dict_index_is_sec_or_ibuf(index)
+ && page_is_leaf(page_align(rec))) {
+ page_update_max_trx_id(new_block, NULL,
+ page_get_max_trx_id(page_align(rec)),
+ mtr);
+ }
+
if (UNIV_LIKELY_NULL(new_page_zip)) {
mtr_set_log_mode(mtr, log_mode);
@@ -809,14 +822,7 @@ page_copy_rec_list_start(
}
}
- /* Update MAX_TRX_ID, the lock table, and possible hash index */
-
- if (dict_index_is_sec_or_ibuf(index)
- && page_is_leaf(page_align(rec))) {
- page_update_max_trx_id(new_block, new_page_zip,
- page_get_max_trx_id(page_align(rec)),
- mtr);
- }
+ /* Update the lock table and possible hash index */
lock_move_rec_list_start(new_block, block, rec, ret);
@@ -2408,8 +2414,13 @@ page_validate(
}
offs = page_offset(rec_get_start(rec, offsets));
+ i = rec_offs_size(offsets);
+ if (UNIV_UNLIKELY(offs + i >= UNIV_PAGE_SIZE)) {
+ fputs("InnoDB: record offset out of bounds\n", stderr);
+ goto func_exit;
+ }
- for (i = rec_offs_size(offsets); i--; ) {
+ while (i--) {
if (UNIV_UNLIKELY(buf[offs + i])) {
/* No other record may overlap this */
@@ -2517,8 +2528,13 @@ n_owned_zero:
count++;
offs = page_offset(rec_get_start(rec, offsets));
+ i = rec_offs_size(offsets);
+ if (UNIV_UNLIKELY(offs + i >= UNIV_PAGE_SIZE)) {
+ fputs("InnoDB: record offset out of bounds\n", stderr);
+ goto func_exit;
+ }
- for (i = rec_offs_size(offsets); i--; ) {
+ while (i--) {
if (UNIV_UNLIKELY(buf[offs + i])) {
fputs("InnoDB: Record overlaps another"
diff --git a/storage/xtradb/page/page0zip.c b/storage/xtradb/page/page0zip.c
index c5c373781a2..7ef44f3246f 100644
--- a/storage/xtradb/page/page0zip.c
+++ b/storage/xtradb/page/page0zip.c
@@ -3117,8 +3117,13 @@ page_zip_validate_low(
temp_page_zip in a debugger when running valgrind --db-attach. */
VALGRIND_GET_VBITS(page, temp_page, UNIV_PAGE_SIZE);
UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
+# if UNIV_WORD_SIZE == 4
VALGRIND_GET_VBITS(page_zip, &temp_page_zip, sizeof temp_page_zip);
+ /* On 32-bit systems, there is no padding in page_zip_des_t.
+ On other systems, Valgrind could complain about uninitialized
+ pad bytes. */
UNIV_MEM_ASSERT_RW(page_zip, sizeof *page_zip);
+# endif
VALGRIND_GET_VBITS(page_zip->data, temp_page,
page_zip_get_size(page_zip));
UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
diff --git a/storage/xtradb/plug.in b/storage/xtradb/plug.in
index c77bd15be1b..37c895fb520 100644
--- a/storage/xtradb/plug.in
+++ b/storage/xtradb/plug.in
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+# Copyright (c) 2006, 2010, Innobase Oy. All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
diff --git a/storage/xtradb/rem/rem0rec.c b/storage/xtradb/rem/rem0rec.c
index 1c8b3fd8c1e..37ba8ca2ffe 100644
--- a/storage/xtradb/rem/rem0rec.c
+++ b/storage/xtradb/rem/rem0rec.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -212,6 +212,13 @@ rec_get_n_extern_new(
const dict_col_t* col
= dict_field_get_col(field);
len = *lens--;
+ /* If the maximum length of the field is up
+ to 255 bytes, the actual length is always
+ stored in one byte. If the maximum length is
+ more than 255 bytes, the actual length is
+ stored in one byte for 0..127. The length
+ will be encoded in two bytes when it is 128 or
+ more, or when the field is stored externally. */
if (UNIV_UNLIKELY(col->len > 255)
|| UNIV_UNLIKELY(col->mtype == DATA_BLOB)) {
if (len & 0x80) {
@@ -294,6 +301,13 @@ rec_init_offsets_comp_ordinary(
const dict_col_t* col
= dict_field_get_col(field);
len = *lens--;
+ /* If the maximum length of the field is up
+ to 255 bytes, the actual length is always
+ stored in one byte. If the maximum length is
+ more than 255 bytes, the actual length is
+ stored in one byte for 0..127. The length
+ will be encoded in two bytes when it is 128 or
+ more, or when the field is stored externally. */
if (UNIV_UNLIKELY(col->len > 255)
|| UNIV_UNLIKELY(col->mtype
== DATA_BLOB)) {
@@ -425,6 +439,15 @@ rec_init_offsets(
const dict_col_t* col
= dict_field_get_col(field);
len = *lens--;
+ /* If the maximum length of the field
+ is up to 255 bytes, the actual length
+ is always stored in one byte. If the
+ maximum length is more than 255 bytes,
+ the actual length is stored in one
+ byte for 0..127. The length will be
+ encoded in two bytes when it is 128 or
+ more, or when the field is stored
+ externally. */
if (UNIV_UNLIKELY(col->len > 255)
|| UNIV_UNLIKELY(col->mtype
== DATA_BLOB)) {
@@ -647,6 +670,13 @@ rec_get_offsets_reverse(
const dict_col_t* col
= dict_field_get_col(field);
len = *lens++;
+ /* If the maximum length of the field is up
+ to 255 bytes, the actual length is always
+ stored in one byte. If the maximum length is
+ more than 255 bytes, the actual length is
+ stored in one byte for 0..127. The length
+ will be encoded in two bytes when it is 128 or
+ more, or when the field is stored externally. */
if (UNIV_UNLIKELY(col->len > 255)
|| UNIV_UNLIKELY(col->mtype == DATA_BLOB)) {
if (len & 0x80) {
@@ -695,19 +725,9 @@ rec_get_nth_field_offs_old(
ulint os;
ulint next_os;
- ut_ad(rec && len);
- ut_ad(n < rec_get_n_fields_old(rec));
-
- if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS)) {
- fprintf(stderr, "Error: trying to access field %lu in rec\n",
- (ulong) n);
- ut_error;
- }
-
- if (UNIV_UNLIKELY(rec == NULL)) {
- fputs("Error: rec is NULL pointer\n", stderr);
- ut_error;
- }
+ ut_ad(len);
+ ut_a(rec);
+ ut_a(n < rec_get_n_fields_old(rec));
if (rec_get_1byte_offs_flag(rec)) {
os = rec_1_get_field_start_offs(rec, n);
@@ -791,12 +811,20 @@ rec_get_converted_size_comp_prefix(
ut_ad(len <= col->len || col->mtype == DATA_BLOB);
+ /* If the maximum length of a variable-length field
+ is up to 255 bytes, the actual length is always stored
+ in one byte. If the maximum length is more than 255
+ bytes, the actual length is stored in one byte for
+ 0..127. The length will be encoded in two bytes when
+ it is 128 or more, or when the field is stored externally. */
+
if (field->fixed_len) {
ut_ad(len == field->fixed_len);
/* dict_index_add_col() should guarantee this */
ut_ad(!field->prefix_len
|| field->fixed_len == field->prefix_len);
} else if (dfield_is_ext(&fields[i])) {
+ ut_ad(col->len >= 256 || col->mtype == DATA_BLOB);
extra_size += 2;
} else if (len < 128
|| (col->len < 256 && col->mtype != DATA_BLOB)) {
@@ -1096,6 +1124,8 @@ rec_convert_dtuple_to_rec_comp(
/* Store the data and the offsets */
for (i = 0, field = fields; i < n_fields; i++, field++) {
+ const dict_field_t* ifield;
+
type = dfield_get_type(field);
len = dfield_get_len(field);
@@ -1130,12 +1160,20 @@ rec_convert_dtuple_to_rec_comp(
/* only nullable fields can be null */
ut_ad(!dfield_is_null(field));
- fixed_len = dict_index_get_nth_field(index, i)->fixed_len;
-
+ ifield = dict_index_get_nth_field(index, i);
+ fixed_len = ifield->fixed_len;
+ /* If the maximum length of a variable-length field
+ is up to 255 bytes, the actual length is always stored
+ in one byte. If the maximum length is more than 255
+ bytes, the actual length is stored in one byte for
+ 0..127. The length will be encoded in two bytes when
+ it is 128 or more, or when the field is stored externally. */
if (fixed_len) {
ut_ad(len == fixed_len);
ut_ad(!dfield_is_ext(field));
} else if (dfield_is_ext(field)) {
+ ut_ad(ifield->col->len >= 256
+ || ifield->col->mtype == DATA_BLOB);
ut_ad(len <= REC_MAX_INDEX_COL_LEN
+ BTR_EXTERN_FIELD_REF_SIZE);
*lens-- = (byte) (len >> 8) | 0xc0;
@@ -1225,11 +1263,20 @@ rec_convert_dtuple_to_rec(
mem_heap_t* heap = NULL;
ulint offsets_[REC_OFFS_NORMAL_SIZE];
const ulint* offsets;
+ ulint i;
rec_offs_init(offsets_);
offsets = rec_get_offsets(rec, index,
offsets_, ULINT_UNDEFINED, &heap);
ut_ad(rec_validate(rec, offsets));
+ ut_ad(dtuple_get_n_fields(dtuple)
+ == rec_offs_n_fields(offsets));
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ ut_ad(!dfield_is_ext(dtuple_get_nth_field(dtuple, i))
+ == !rec_offs_nth_extern(offsets, i));
+ }
+
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
@@ -1412,6 +1459,13 @@ rec_copy_prefix_to_buf(
prefix_len += field->fixed_len;
} else {
ulint len = *lens--;
+ /* If the maximum length of the column is up
+ to 255 bytes, the actual length is always
+ stored in one byte. If the maximum length is
+ more than 255 bytes, the actual length is
+ stored in one byte for 0..127. The length
+ will be encoded in two bytes when it is 128 or
+ more, or when the column is stored externally. */
if (col->len > 255 || col->mtype == DATA_BLOB) {
if (len & 0x80) {
/* 1exxxxxx */
diff --git a/storage/xtradb/row/row0ins.c b/storage/xtradb/row/row0ins.c
index 4b104ced649..d7475d613ad 100644
--- a/storage/xtradb/row/row0ins.c
+++ b/storage/xtradb/row/row0ins.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -1997,7 +1997,7 @@ row_ins_index_entry_low(
btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
mode | BTR_INSERT | ignore_sec_unique,
- &cursor, 0, &mtr);
+ &cursor, 0, __FILE__, __LINE__, &mtr);
if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) {
/* The insertion was made to the insert buffer already during
@@ -2055,7 +2055,8 @@ row_ins_index_entry_low(
btr_cur_search_to_nth_level(index, 0, entry,
PAGE_CUR_LE,
mode | BTR_INSERT,
- &cursor, 0, &mtr);
+ &cursor, 0,
+ __FILE__, __LINE__, &mtr);
}
}
@@ -2110,7 +2111,8 @@ function_exit:
mtr_start(&mtr);
btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
- BTR_MODIFY_TREE, &cursor, 0, &mtr);
+ BTR_MODIFY_TREE, &cursor, 0,
+ __FILE__, __LINE__, &mtr);
rec = btr_cur_get_rec(&cursor);
offsets = rec_get_offsets(rec, index, NULL,
ULINT_UNDEFINED, &heap);
diff --git a/storage/xtradb/row/row0merge.c b/storage/xtradb/row/row0merge.c
index 93b2095dc26..6ee93d24ed3 100644
--- a/storage/xtradb/row/row0merge.c
+++ b/storage/xtradb/row/row0merge.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -424,14 +424,13 @@ row_merge_dup_report(
row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
const dfield_t* entry) /*!< in: duplicate index entry */
{
- mrec_buf_t buf;
+ mrec_buf_t* buf;
const dtuple_t* tuple;
dtuple_t tuple_store;
const rec_t* rec;
const dict_index_t* index = dup->index;
ulint n_fields= dict_index_get_n_fields(index);
- mem_heap_t* heap = NULL;
- ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ mem_heap_t* heap;
ulint* offsets;
ulint n_ext;
@@ -441,22 +440,22 @@ row_merge_dup_report(
return;
}
- rec_offs_init(offsets_);
-
/* Convert the tuple to a record and then to MySQL format. */
+ heap = mem_heap_create((1 + REC_OFFS_HEADER_SIZE + n_fields)
+ * sizeof *offsets
+ + sizeof *buf);
+
+ buf = mem_heap_alloc(heap, sizeof *buf);
tuple = dtuple_from_fields(&tuple_store, entry, n_fields);
n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0;
- rec = rec_convert_dtuple_to_rec(buf, index, tuple, n_ext);
- offsets = rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED,
- &heap);
+ rec = rec_convert_dtuple_to_rec(*buf, index, tuple, n_ext);
+ offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
innobase_rec_to_mysql(dup->table, rec, index, offsets);
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
- }
+ mem_heap_free(heap);
}
/*************************************************************//**
@@ -627,22 +626,26 @@ row_merge_buf_write(
}
/******************************************************//**
-Create a memory heap and allocate space for row_merge_rec_offsets().
+Create a memory heap and allocate space for row_merge_rec_offsets()
+and mrec_buf_t[3].
@return memory heap */
static
mem_heap_t*
row_merge_heap_create(
/*==================*/
const dict_index_t* index, /*!< in: record descriptor */
+ mrec_buf_t** buf, /*!< out: 3 buffers */
ulint** offsets1, /*!< out: offsets */
ulint** offsets2) /*!< out: offsets */
{
ulint i = 1 + REC_OFFS_HEADER_SIZE
+ dict_index_get_n_fields(index);
- mem_heap_t* heap = mem_heap_create(2 * i * sizeof *offsets1);
+ mem_heap_t* heap = mem_heap_create(2 * i * sizeof **offsets1
+ + 3 * sizeof **buf);
- *offsets1 = mem_heap_alloc(heap, i * sizeof *offsets1);
- *offsets2 = mem_heap_alloc(heap, i * sizeof *offsets2);
+ *buf = mem_heap_alloc(heap, 3 * sizeof **buf);
+ *offsets1 = mem_heap_alloc(heap, i * sizeof **offsets1);
+ *offsets2 = mem_heap_alloc(heap, i * sizeof **offsets2);
(*offsets1)[0] = (*offsets2)[0] = i;
(*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
@@ -1401,7 +1404,8 @@ row_merge_blocks(
{
mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
- mrec_buf_t buf[3]; /*!< buffer for handling split mrec in block[] */
+ mrec_buf_t* buf; /*!< buffer for handling
+ split mrec in block[] */
const byte* b0; /*!< pointer to block[0] */
const byte* b1; /*!< pointer to block[1] */
byte* b2; /*!< pointer to block[2] */
@@ -1421,7 +1425,7 @@ row_merge_blocks(
}
#endif /* UNIV_DEBUG */
- heap = row_merge_heap_create(index, &offsets0, &offsets1);
+ heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
/* Write a record and read the next record. Split the output
file in two halves, which can be merged on the following pass. */
@@ -1507,7 +1511,7 @@ row_merge_blocks_copy(
{
mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
- mrec_buf_t buf[3]; /*!< buffer for handling
+ mrec_buf_t* buf; /*!< buffer for handling
split mrec in block[] */
const byte* b0; /*!< pointer to block[0] */
byte* b2; /*!< pointer to block[2] */
@@ -1525,7 +1529,7 @@ row_merge_blocks_copy(
}
#endif /* UNIV_DEBUG */
- heap = row_merge_heap_create(index, &offsets0, &offsets1);
+ heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
/* Write a record and read the next record. Split the output
file in two halves, which can be merged on the following pass. */
@@ -1767,7 +1771,6 @@ row_merge_insert_index_tuples(
int fd, /*!< in: file descriptor */
row_merge_block_t* block) /*!< in/out: file buffer */
{
- mrec_buf_t buf;
const byte* b;
que_thr_t* thr;
ins_node_t* node;
@@ -1786,7 +1789,7 @@ row_merge_insert_index_tuples(
trx->op_info = "inserting index entries";
- graph_heap = mem_heap_create(500);
+ graph_heap = mem_heap_create(500 + sizeof(mrec_buf_t));
node = ins_node_create(INS_DIRECT, table, graph_heap);
thr = pars_complete_graph_for_exec(node, trx, graph_heap);
@@ -1808,12 +1811,14 @@ row_merge_insert_index_tuples(
if (!row_merge_read(fd, foffs, block)) {
error = DB_CORRUPTION;
} else {
+ mrec_buf_t* buf = mem_heap_alloc(graph_heap, sizeof *buf);
+
for (;;) {
const mrec_t* mrec;
dtuple_t* dtuple;
ulint n_ext;
- b = row_merge_read_rec(block, &buf, b, index,
+ b = row_merge_read_rec(block, buf, b, index,
fd, &foffs, &mrec, offsets);
if (UNIV_UNLIKELY(!b)) {
/* End of list, or I/O error */
@@ -1984,14 +1989,12 @@ row_merge_drop_index(
/* Drop the field definitions of the index. */
"DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n"
/* Drop the index definition and the B-tree. */
- "DELETE FROM SYS_INDEXES WHERE ID = :indexid\n"
- " AND TABLE_ID = :tableid;\n"
+ "DELETE FROM SYS_INDEXES WHERE ID = :indexid;\n"
"END;\n";
ut_ad(index && table && trx);
pars_info_add_dulint_literal(info, "indexid", index->id);
- pars_info_add_dulint_literal(info, "tableid", table->id);
trx_start_if_not_started(trx);
trx->op_info = "dropping index";
@@ -2040,47 +2043,79 @@ row_merge_drop_temp_indexes(void)
/*=============================*/
{
trx_t* trx;
- ulint err;
-
- /* We use the private SQL parser of Innobase to generate the
- query graphs needed in deleting the dictionary data from system
- tables in Innobase. Deleting a row from SYS_INDEXES table also
- frees the file segments of the B-tree associated with the index. */
- static const char drop_temp_indexes[] =
- "PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
- "indexid CHAR;\n"
- "DECLARE CURSOR c IS SELECT ID FROM SYS_INDEXES\n"
- "WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "';\n"
- "BEGIN\n"
- "\tOPEN c;\n"
- "\tWHILE 1=1 LOOP\n"
- "\t\tFETCH c INTO indexid;\n"
- "\t\tIF (SQL % NOTFOUND) THEN\n"
- "\t\t\tEXIT;\n"
- "\t\tEND IF;\n"
- "\t\tDELETE FROM SYS_FIELDS WHERE INDEX_ID = indexid;\n"
- "\t\tDELETE FROM SYS_INDEXES WHERE ID = indexid;\n"
- "\tEND LOOP;\n"
- "\tCLOSE c;\n"
- "\tCOMMIT WORK;\n"
- "END;\n";
+ btr_pcur_t pcur;
+ mtr_t mtr;
+ /* Load the table definitions that contain partially defined
+ indexes, so that the data dictionary information can be checked
+ when accessing the tablename.ibd files. */
trx = trx_allocate_for_background();
trx->op_info = "dropping partially created indexes";
row_mysql_lock_data_dictionary(trx);
- /* Incomplete transactions may be holding some locks on the
- data dictionary tables. However, they should never have been
- able to lock the records corresponding to the partially
- created indexes that we are attempting to delete, because the
- table was locked when the indexes were being created. We will
- drop the partially created indexes before the rollback of
- incomplete transactions is initiated. Thus, this should not
- interfere with the incomplete transactions. */
- trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
- err = que_eval_sql(NULL, drop_temp_indexes, FALSE, trx);
- ut_a(err == DB_SUCCESS);
+ mtr_start(&mtr);
+
+ btr_pcur_open_at_index_side(
+ TRUE,
+ dict_table_get_first_index(dict_sys->sys_indexes),
+ BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+ for (;;) {
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ dulint table_id;
+ dict_table_t* table;
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ break;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
+ field = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_NAME_FIELD,
+ &len);
+ if (len == UNIV_SQL_NULL || len == 0
+ || (char) *field != TEMP_INDEX_PREFIX) {
+ continue;
+ }
+
+ /* This is a temporary index. */
+
+ field = rec_get_nth_field_old(rec, 0/*TABLE_ID*/, &len);
+ if (len != 8) {
+ /* Corrupted TABLE_ID */
+ continue;
+ }
+
+ table_id = mach_read_from_8(field);
+
+ btr_pcur_store_position(&pcur, &mtr);
+ btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+ table = dict_load_table_on_id(table_id);
+
+ if (table) {
+ dict_index_t* index;
+
+ for (index = dict_table_get_first_index(table);
+ index; index = dict_table_get_next_index(index)) {
+
+ if (*index->name == TEMP_INDEX_PREFIX) {
+ row_merge_drop_index(index, table, trx);
+ trx_commit_for_mysql(trx);
+ }
+ }
+ }
+
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ &pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
row_mysql_unlock_data_dictionary(trx);
trx_free_for_background(trx);
}
diff --git a/storage/xtradb/row/row0mysql.c b/storage/xtradb/row/row0mysql.c
index 8585b816911..e520065ea04 100644
--- a/storage/xtradb/row/row0mysql.c
+++ b/storage/xtradb/row/row0mysql.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2000, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2000, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -485,7 +485,7 @@ next_column:
/****************************************************************//**
Handles user errors and lock waits detected by the database engine.
@return TRUE if it was a lock wait and we should continue running the
-query thread */
+query thread and in that case the thr is ALREADY in the running state. */
UNIV_INTERN
ibool
row_mysql_handle_errors(
@@ -1461,7 +1461,7 @@ row_unlock_for_mysql(
if (UNIV_UNLIKELY
(!srv_locks_unsafe_for_binlog
- && trx->isolation_level != TRX_ISO_READ_COMMITTED)) {
+ && trx->isolation_level > TRX_ISO_READ_COMMITTED)) {
fprintf(stderr,
"InnoDB: Error: calling row_unlock_for_mysql though\n"
@@ -3258,19 +3258,13 @@ check_next_foreign:
"END;\n"
, FALSE, trx);
- if (err != DB_SUCCESS) {
- ut_a(err == DB_OUT_OF_FILE_SPACE);
-
- err = DB_MUST_GET_MORE_FILE_SPACE;
-
- row_mysql_handle_errors(&err, trx, NULL, NULL);
-
- ut_error;
- } else {
- ibool is_path;
+ switch (err) {
+ ibool is_temp;
const char* name_or_path;
mem_heap_t* heap;
+ case DB_SUCCESS:
+
heap = mem_heap_create(200);
/* Clone the name, in case it has been allocated
@@ -3280,12 +3274,13 @@ check_next_foreign:
space_id = table->space;
if (table->dir_path_of_temp_table != NULL) {
- is_path = TRUE;
name_or_path = mem_heap_strdup(
heap, table->dir_path_of_temp_table);
+ is_temp = TRUE;
} else {
- is_path = FALSE;
name_or_path = name;
+ is_temp = (table->flags >> DICT_TF2_SHIFT)
+ & DICT_TF2_TEMPORARY;
}
dict_table_remove_from_cache(table);
@@ -3302,11 +3297,11 @@ check_next_foreign:
/* Do not drop possible .ibd tablespace if something went
wrong: we do not want to delete valuable data of the user */
- if (err == DB_SUCCESS && space_id > 0) {
+ if (err == DB_SUCCESS && !trx_sys_sys_space(space_id)) {
if (!fil_space_for_table_exists_in_mem(space_id,
name_or_path,
- is_path,
- FALSE, TRUE)) {
+ is_temp, FALSE,
+ !is_temp)) {
err = DB_SUCCESS;
fprintf(stderr,
@@ -3335,7 +3330,27 @@ check_next_foreign:
}
mem_heap_free(heap);
+ break;
+
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ /* Cannot even find a free slot for the
+ the undo log. We can directly exit here
+ and return the DB_TOO_MANY_CONCURRENT_TRXS
+ error. */
+ break;
+
+ case DB_OUT_OF_FILE_SPACE:
+ err = DB_MUST_GET_MORE_FILE_SPACE;
+
+ row_mysql_handle_errors(&err, trx, NULL, NULL);
+
+ /* Fall through to raise error */
+
+ default:
+ /* No other possible error returns */
+ ut_error;
}
+
funct_exit:
if (locked_dictionary) {
@@ -3351,6 +3366,90 @@ funct_exit:
return((int) err);
}
+/*********************************************************************//**
+Drop all temporary tables during crash recovery. */
+UNIV_INTERN
+void
+row_mysql_drop_temp_tables(void)
+/*============================*/
+{
+ trx_t* trx;
+ btr_pcur_t pcur;
+ mtr_t mtr;
+ mem_heap_t* heap;
+
+ trx = trx_allocate_for_background();
+ trx->op_info = "dropping temporary tables";
+ row_mysql_lock_data_dictionary(trx);
+
+ heap = mem_heap_create(200);
+
+ mtr_start(&mtr);
+
+ btr_pcur_open_at_index_side(
+ TRUE,
+ dict_table_get_first_index(dict_sys->sys_tables),
+ BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+ for (;;) {
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ const char* table_name;
+ dict_table_t* table;
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ break;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
+ field = rec_get_nth_field_old(rec, 4/*N_COLS*/, &len);
+ if (len != 4 || !(mach_read_from_4(field) & 0x80000000UL)) {
+ continue;
+ }
+
+ /* Because this is not a ROW_FORMAT=REDUNDANT table,
+ the is_temp flag is valid. Examine it. */
+
+ field = rec_get_nth_field_old(rec, 7/*MIX_LEN*/, &len);
+ if (len != 4
+ || !(mach_read_from_4(field) & DICT_TF2_TEMPORARY)) {
+ continue;
+ }
+
+ /* This is a temporary table. */
+ field = rec_get_nth_field_old(rec, 0/*NAME*/, &len);
+ if (len == UNIV_SQL_NULL || len == 0) {
+ /* Corrupted SYS_TABLES.NAME */
+ continue;
+ }
+
+ table_name = mem_heap_strdupl(heap, (const char*) field, len);
+
+ btr_pcur_store_position(&pcur, &mtr);
+ btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+ table = dict_load_table(table_name);
+
+ if (table) {
+ row_drop_table_for_mysql(table_name, trx, FALSE);
+ trx_commit_for_mysql(trx);
+ }
+
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ &pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+ row_mysql_unlock_data_dictionary(trx);
+ trx_free_for_background(trx);
+}
+
/*******************************************************************//**
Drop all foreign keys in a database, see Bug#18942.
Called at the end of row_drop_database_for_mysql().
@@ -3902,14 +4001,15 @@ Checks that the index contains entries in an ascending order, unique
constraint is not broken, and calculates the number of index entries
in the read view of the current transaction.
@return TRUE if ok */
-static
+UNIV_INTERN
ibool
-row_scan_and_check_index(
-/*=====================*/
- row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in MySQL */
- dict_index_t* index, /*!< in: index */
- ulint* n_rows) /*!< out: number of entries seen in the
- current consistent read */
+row_check_index_for_mysql(
+/*======================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct
+ in MySQL handle */
+ const dict_index_t* index, /*!< in: index */
+ ulint* n_rows) /*!< out: number of entries
+ seen in the consistent read */
{
dtuple_t* prev_entry = NULL;
ulint matched_fields;
@@ -3930,31 +4030,9 @@ row_scan_and_check_index(
*n_rows = 0;
- if (!row_merge_is_index_usable(prebuilt->trx, index)) {
- /* A newly created index may lack some delete-marked
- records that may exist in the read view of
- prebuilt->trx. Thus, such indexes must not be
- accessed by consistent read. */
- return(is_ok);
- }
-
buf = mem_alloc(UNIV_PAGE_SIZE);
heap = mem_heap_create(100);
- /* Make a dummy template in prebuilt, which we will use
- in scanning the index entries */
-
- prebuilt->index = index;
- /* row_merge_is_index_usable() was already checked above. */
- prebuilt->index_usable = TRUE;
- prebuilt->sql_stat_start = TRUE;
- prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE;
- prebuilt->n_template = 0;
- prebuilt->need_to_access_clustered = FALSE;
-
- dtuple_set_n_fields(prebuilt->search_tuple, 0);
-
- prebuilt->select_lock_type = LOCK_NONE;
cnt = 1000;
ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0);
@@ -4073,119 +4151,6 @@ not_ok:
}
/*********************************************************************//**
-Checks a table for corruption.
-@return DB_ERROR or DB_SUCCESS */
-UNIV_INTERN
-ulint
-row_check_table_for_mysql(
-/*======================*/
- row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
- handle */
-{
- dict_table_t* table = prebuilt->table;
- dict_index_t* index;
- ulint n_rows;
- ulint n_rows_in_table = ULINT_UNDEFINED;
- ulint ret = DB_SUCCESS;
- ulint old_isolation_level;
-
- if (table->ibd_file_missing) {
- ut_print_timestamp(stderr);
- fprintf(stderr, " InnoDB: Error:\n"
- "InnoDB: MySQL is trying to use a table handle"
- " but the .ibd file for\n"
- "InnoDB: table %s does not exist.\n"
- "InnoDB: Have you deleted the .ibd file"
- " from the database directory under\n"
- "InnoDB: the MySQL datadir, or have you"
- " used DISCARD TABLESPACE?\n"
- "InnoDB: Look from\n"
- "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
- "InnoDB: how you can resolve the problem.\n",
- table->name);
- return(DB_ERROR);
- }
-
- prebuilt->trx->op_info = "checking table";
-
- old_isolation_level = prebuilt->trx->isolation_level;
-
- /* We must run the index record counts at an isolation level
- >= READ COMMITTED, because a dirty read can see a wrong number
- of records in some index; to play safe, we use always
- REPEATABLE READ here */
-
- prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ;
-
- /* Enlarge the fatal lock wait timeout during CHECK TABLE. */
- mutex_enter(&kernel_mutex);
- srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */
- mutex_exit(&kernel_mutex);
-
- index = dict_table_get_first_index(table);
-
- while (index != NULL) {
- /* fputs("Validating index ", stderr);
- ut_print_name(stderr, trx, FALSE, index->name);
- putc('\n', stderr); */
-
- if (!btr_validate_index(index, prebuilt->trx)) {
- ret = DB_ERROR;
- } else {
- if (!row_scan_and_check_index(prebuilt,index, &n_rows)){
- ret = DB_ERROR;
- }
-
- if (trx_is_interrupted(prebuilt->trx)) {
- ret = DB_INTERRUPTED;
- break;
- }
-
- /* fprintf(stderr, "%lu entries in index %s\n", n_rows,
- index->name); */
-
- if (index == dict_table_get_first_index(table)) {
- n_rows_in_table = n_rows;
- } else if (n_rows != n_rows_in_table) {
-
- ret = DB_ERROR;
-
- fputs("Error: ", stderr);
- dict_index_name_print(stderr,
- prebuilt->trx, index);
- fprintf(stderr,
- " contains %lu entries,"
- " should be %lu\n",
- (ulong) n_rows,
- (ulong) n_rows_in_table);
- }
- }
-
- index = dict_table_get_next_index(index);
- }
-
- /* Restore the original isolation level */
- prebuilt->trx->isolation_level = old_isolation_level;
-
- /* We validate also the whole adaptive hash index for all tables
- at every CHECK TABLE */
-
- if (!btr_search_validate()) {
-
- ret = DB_ERROR;
- }
-
- /* Restore the fatal lock wait timeout after CHECK TABLE. */
- mutex_enter(&kernel_mutex);
- srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */
- mutex_exit(&kernel_mutex);
-
- prebuilt->trx->op_info = "";
-
- return(ret);
-}
-
-/*********************************************************************//**
Determines if a table is a magic monitor table.
@return TRUE if monitor table */
UNIV_INTERN
diff --git a/storage/xtradb/row/row0row.c b/storage/xtradb/row/row0row.c
index 128ac3ba3e8..cb7dfa2b7c9 100644
--- a/storage/xtradb/row/row0row.c
+++ b/storage/xtradb/row/row0row.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -915,6 +915,10 @@ row_raw_format(
ret = row_raw_format_int(data, data_len, prtype,
buf, buf_size, &format_in_hex);
+ if (format_in_hex) {
+
+ goto format_in_hex;
+ }
break;
case DATA_CHAR:
case DATA_VARCHAR:
@@ -923,14 +927,15 @@ row_raw_format(
ret = row_raw_format_str(data, data_len, prtype,
buf, buf_size, &format_in_hex);
+ if (format_in_hex) {
+
+ goto format_in_hex;
+ }
+
break;
/* XXX support more data types */
default:
-
- format_in_hex = TRUE;
- }
-
- if (format_in_hex) {
+ format_in_hex:
if (UNIV_LIKELY(buf_size > 2)) {
diff --git a/storage/xtradb/row/row0sel.c b/storage/xtradb/row/row0sel.c
index 1d5d525cca6..000cc3586fd 100644
--- a/storage/xtradb/row/row0sel.c
+++ b/storage/xtradb/row/row0sel.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -132,7 +132,8 @@ index record.
NOTE: the comparison is NOT done as a binary comparison, but character
fields are compared with collation!
@return TRUE if the secondary record is equal to the corresponding
-fields in the clustered record, when compared with collation */
+fields in the clustered record, when compared with collation;
+FALSE if not equal or if the clustered record has been marked for deletion */
static
ibool
row_sel_sec_rec_is_for_clust_rec(
@@ -431,10 +432,6 @@ row_sel_fetch_columns(
data = rec_get_nth_field(rec, offsets,
field_no, &len);
- if (len == UNIV_SQL_NULL) {
- len = UNIV_SQL_NULL;
- }
-
needs_copy = column->copy_val;
}
@@ -855,7 +852,7 @@ row_sel_get_clust_rec(
trx = thr_get_trx(thr);
if (srv_locks_unsafe_for_binlog
- || trx->isolation_level == TRX_ISO_READ_COMMITTED) {
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
lock_type = LOCK_REC_NOT_GAP;
} else {
lock_type = LOCK_ORDINARY;
@@ -1468,7 +1465,7 @@ rec_loop:
if (srv_locks_unsafe_for_binlog
|| trx->isolation_level
- == TRX_ISO_READ_COMMITTED) {
+ <= TRX_ISO_READ_COMMITTED) {
if (page_rec_is_supremum(next_rec)) {
@@ -1525,7 +1522,7 @@ skip_lock:
trx = thr_get_trx(thr);
if (srv_locks_unsafe_for_binlog
- || trx->isolation_level == TRX_ISO_READ_COMMITTED) {
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
if (page_rec_is_supremum(rec)) {
@@ -2170,36 +2167,6 @@ row_fetch_print(
return((void*)42);
}
-/****************************************************************//**
-Callback function for fetch that stores an unsigned 4 byte integer to the
-location pointed. The column's type must be DATA_INT, DATA_UNSIGNED, length
-= 4.
-@return always returns NULL */
-UNIV_INTERN
-void*
-row_fetch_store_uint4(
-/*==================*/
- void* row, /*!< in: sel_node_t* */
- void* user_arg) /*!< in: data pointer */
-{
- sel_node_t* node = row;
- ib_uint32_t* val = user_arg;
- ulint tmp;
-
- dfield_t* dfield = que_node_get_val(node->select_list);
- const dtype_t* type = dfield_get_type(dfield);
- ulint len = dfield_get_len(dfield);
-
- ut_a(dtype_get_mtype(type) == DATA_INT);
- ut_a(dtype_get_prtype(type) & DATA_UNSIGNED);
- ut_a(len == 4);
-
- tmp = mach_read_from_4(dfield_get_data(dfield));
- *val = (ib_uint32_t) tmp;
-
- return(NULL);
-}
-
/***********************************************************//**
Prints a row in a select result.
@return query thread to run next or NULL */
@@ -2981,6 +2948,7 @@ row_sel_get_clust_rec_for_mysql(
if (clust_rec
&& (old_vers
+ || trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
|| rec_get_deleted_flag(rec, dict_table_is_comp(
sec_index->table)))
&& !row_sel_sec_rec_is_for_clust_rec(
@@ -3202,14 +3170,17 @@ row_sel_try_search_shortcut_for_mysql(
ut_ad(dict_index_is_clust(index));
ut_ad(!prebuilt->templ_contains_blob);
+#ifndef UNIV_SEARCH_DEBUG
btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
BTR_SEARCH_LEAF, pcur,
-#ifndef UNIV_SEARCH_DEBUG
RW_S_LATCH,
-#else
+ mtr);
+#else /* UNIV_SEARCH_DEBUG */
+ btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, pcur,
0,
-#endif
mtr);
+#endif /* UNIV_SEARCH_DEBUG */
rec = btr_pcur_get_rec(pcur);
if (!page_rec_is_user_rec(rec)) {
@@ -3694,7 +3665,7 @@ shortcut_fails_too_big_rec:
&& !page_rec_is_supremum(rec)
&& set_also_gap_locks
&& !(srv_locks_unsafe_for_binlog
- || trx->isolation_level == TRX_ISO_READ_COMMITTED)
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
&& prebuilt->select_lock_type != LOCK_NONE) {
/* Try to place a gap lock on the next index record
@@ -3797,7 +3768,7 @@ rec_loop:
if (set_also_gap_locks
&& !(srv_locks_unsafe_for_binlog
- || trx->isolation_level == TRX_ISO_READ_COMMITTED)
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
&& prebuilt->select_lock_type != LOCK_NONE) {
/* Try to place a lock on the index record */
@@ -3931,7 +3902,7 @@ wrong_offs:
if (set_also_gap_locks
&& !(srv_locks_unsafe_for_binlog
|| trx->isolation_level
- == TRX_ISO_READ_COMMITTED)
+ <= TRX_ISO_READ_COMMITTED)
&& prebuilt->select_lock_type != LOCK_NONE) {
/* Try to place a gap lock on the index
@@ -3967,7 +3938,7 @@ wrong_offs:
if (set_also_gap_locks
&& !(srv_locks_unsafe_for_binlog
|| trx->isolation_level
- == TRX_ISO_READ_COMMITTED)
+ <= TRX_ISO_READ_COMMITTED)
&& prebuilt->select_lock_type != LOCK_NONE) {
/* Try to place a gap lock on the index
@@ -4015,7 +3986,7 @@ wrong_offs:
if (!set_also_gap_locks
|| srv_locks_unsafe_for_binlog
- || trx->isolation_level == TRX_ISO_READ_COMMITTED
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED
|| (unique_search
&& !UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp)))) {
@@ -4054,7 +4025,7 @@ no_gap_lock:
const rec_t* old_vers;
case DB_SUCCESS:
if (srv_locks_unsafe_for_binlog
- || trx->isolation_level == TRX_ISO_READ_COMMITTED) {
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
/* Note that a record of
prebuilt->index was locked. */
prebuilt->new_rec_locks = 1;
@@ -4063,6 +4034,7 @@ no_gap_lock:
case DB_LOCK_WAIT:
if (UNIV_LIKELY(prebuilt->row_read_type
!= ROW_READ_TRY_SEMI_CONSISTENT)
+ || unique_search
|| index != clust_index) {
goto lock_wait_or_error;
@@ -4186,7 +4158,7 @@ no_gap_lock:
/* The record is delete-marked: we can skip it */
if ((srv_locks_unsafe_for_binlog
- || trx->isolation_level == TRX_ISO_READ_COMMITTED)
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
&& prebuilt->select_lock_type != LOCK_NONE
&& !did_semi_consistent_read) {
@@ -4253,7 +4225,7 @@ requires_clust_rec:
}
if ((srv_locks_unsafe_for_binlog
- || trx->isolation_level == TRX_ISO_READ_COMMITTED)
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
&& prebuilt->select_lock_type != LOCK_NONE) {
/* Note that both the secondary index record
and the clustered index record were locked. */
@@ -4266,7 +4238,7 @@ requires_clust_rec:
/* The record is delete marked: we can skip it */
if ((srv_locks_unsafe_for_binlog
- || trx->isolation_level == TRX_ISO_READ_COMMITTED)
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
&& prebuilt->select_lock_type != LOCK_NONE) {
/* No need to keep a lock on a delete-marked
@@ -4477,7 +4449,7 @@ lock_wait_or_error:
moves_up, &mtr);
if ((srv_locks_unsafe_for_binlog
- || trx->isolation_level == TRX_ISO_READ_COMMITTED)
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
&& !same_user_rec) {
/* Since we were not able to restore the cursor
@@ -4641,7 +4613,6 @@ row_search_autoinc_read_column(
ut_a(len != UNIV_SQL_NULL);
- /* we assume AUTOINC value cannot be negative */
switch (mtype) {
case DATA_INT:
ut_a(len <= sizeof value);
@@ -4650,12 +4621,12 @@ row_search_autoinc_read_column(
case DATA_FLOAT:
ut_a(len == sizeof(float));
- value = mach_float_read(data);
+ value = (ib_uint64_t) mach_float_read(data);
break;
case DATA_DOUBLE:
ut_a(len == sizeof(double));
- value = mach_double_read(data);
+ value = (ib_uint64_t) mach_double_read(data);
break;
default:
diff --git a/storage/xtradb/row/row0umod.c b/storage/xtradb/row/row0umod.c
index 6be475d8c78..e7245dbee41 100644
--- a/storage/xtradb/row/row0umod.c
+++ b/storage/xtradb/row/row0umod.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -144,13 +144,17 @@ row_undo_mod_clust_low(
/***********************************************************//**
Removes a clustered index record after undo if possible.
+This is attempted when the record was inserted by updating a
+delete-marked record and there no longer exist transactions
+that would see the delete-marked record. In other words, we
+roll back the insert by purging the record.
@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */
static
ulint
row_undo_mod_remove_clust_low(
/*==========================*/
undo_node_t* node, /*!< in: row undo node */
- que_thr_t* thr __attribute__((unused)), /*!< in: query thread */
+ que_thr_t* thr, /*!< in: query thread */
mtr_t* mtr, /*!< in: mtr */
ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
{
@@ -159,6 +163,7 @@ row_undo_mod_remove_clust_low(
ulint err;
ibool success;
+ ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
pcur = &(node->pcur);
btr_cur = btr_pcur_get_btr_cur(pcur);
@@ -190,11 +195,13 @@ row_undo_mod_remove_clust_low(
} else {
ut_ad(mode == BTR_MODIFY_TREE);
- /* Note that since this operation is analogous to purge,
- we can free also inherited externally stored fields:
- hence the RB_NONE in the call below */
+ /* This operation is analogous to purge, we can free also
+ inherited externally stored fields */
- btr_cur_pessimistic_delete(&err, FALSE, btr_cur, RB_NONE, mtr);
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+ thr_is_recv(thr)
+ ? RB_RECOVERY_PURGE_REC
+ : RB_NONE, mtr);
/* The delete operation may fail if we have little
file space left: TODO: easiest to crash the database
@@ -370,10 +377,11 @@ row_undo_mod_del_mark_or_remove_sec_low(
} else {
ut_ad(mode == BTR_MODIFY_TREE);
- /* No need to distinguish RB_RECOVERY here, because we
- are deleting a secondary index record: the distinction
- between RB_NORMAL and RB_RECOVERY only matters when
- deleting a record that contains externally stored
+ /* No need to distinguish RB_RECOVERY_PURGE here,
+ because we are deleting a secondary index record:
+ the distinction between RB_NORMAL and
+ RB_RECOVERY_PURGE only matters when deleting a
+ record that contains externally stored
columns. */
ut_ad(!dict_index_is_clust(index));
btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
@@ -438,7 +446,7 @@ row_undo_mod_del_unmark_sec_and_undo_update(
BTR_MODIFY_TREE */
que_thr_t* thr, /*!< in: query thread */
dict_index_t* index, /*!< in: index */
- dtuple_t* entry) /*!< in: index entry */
+ const dtuple_t* entry) /*!< in: index entry */
{
mem_heap_t* heap;
btr_pcur_t pcur;
@@ -533,6 +541,7 @@ row_undo_mod_upd_del_sec(
dict_index_t* index;
ulint err = DB_SUCCESS;
+ ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
heap = mem_heap_create(1024);
while (node->index != NULL) {
@@ -550,7 +559,7 @@ row_undo_mod_upd_del_sec(
does not exist. However, this situation may
only occur during the rollback of incomplete
transactions. */
- ut_a(trx_is_recv(thr_get_trx(thr)));
+ ut_a(thr_is_recv(thr));
} else {
err = row_undo_mod_del_mark_or_remove_sec(
node, thr, index, entry);
diff --git a/storage/xtradb/row/row0upd.c b/storage/xtradb/row/row0upd.c
index 58dfd43ead9..95d1d00aeef 100644
--- a/storage/xtradb/row/row0upd.c
+++ b/storage/xtradb/row/row0upd.c
@@ -1344,9 +1344,6 @@ row_upd_copy_columns(
data = rec_get_nth_field(rec, offsets,
column->field_nos[SYM_CLUST_FIELD_NO],
&len);
- if (len == UNIV_SQL_NULL) {
- len = UNIV_SQL_NULL;
- }
eval_node_copy_and_alloc_val(column, data, len);
column = UT_LIST_GET_NEXT(col_var_list, column);
diff --git a/storage/xtradb/srv/srv0srv.c b/storage/xtradb/srv/srv0srv.c
index e655c4e844d..86bf309bac1 100644
--- a/storage/xtradb/srv/srv0srv.c
+++ b/storage/xtradb/srv/srv0srv.c
@@ -1,7 +1,8 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
+Copyright (c) 2009, Percona Inc.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -9,6 +10,13 @@ briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
@@ -22,32 +30,6 @@ this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA
*****************************************************************************/
-/***********************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-Copyright (c) 2009, Percona Inc.
-
-Portions of this file contain modifications contributed and copyrighted
-by Percona Inc.. Those modifications are
-gratefully acknowledged and are described briefly in the InnoDB
-documentation. The contributions by Percona Inc. are incorporated with
-their permission, and subject to the conditions contained in the file
-COPYING.Percona.
-
-This program is free software; you can redistribute it and/or modify it
-under the terms of the GNU General Public License as published by the
-Free Software Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
-Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-***********************************************************************/
/**************************************************//**
@file srv/srv0srv.c
@@ -122,7 +104,8 @@ UNIV_INTERN ulint srv_fatal_semaphore_wait_threshold = 600;
in microseconds, in order to reduce the lagging of the purge thread. */
UNIV_INTERN ulint srv_dml_needed_delay = 0;
-UNIV_INTERN ibool srv_lock_timeout_and_monitor_active = FALSE;
+UNIV_INTERN ibool srv_lock_timeout_active = FALSE;
+UNIV_INTERN ibool srv_monitor_active = FALSE;
UNIV_INTERN ibool srv_error_monitor_active = FALSE;
UNIV_INTERN const char* srv_main_thread_op_info = "";
@@ -162,9 +145,10 @@ UNIV_INTERN char** srv_data_file_names = NULL;
/* size in database pages */
UNIV_INTERN ulint* srv_data_file_sizes = NULL;
+UNIV_INTERN char* srv_doublewrite_file = NULL;
+
UNIV_INTERN ibool srv_extra_undoslots = FALSE;
-UNIV_INTERN ibool srv_fast_recovery = FALSE;
UNIV_INTERN ibool srv_recovery_stats = FALSE;
UNIV_INTERN ulint srv_use_purge_thread = 0;
@@ -198,11 +182,20 @@ UNIV_INTERN ulong srv_flush_log_at_trx_commit = 1;
the checkpoints. */
UNIV_INTERN char srv_adaptive_flushing = TRUE;
-UNIV_INTERN ulong srv_show_locks_held = 10;
-UNIV_INTERN ulong srv_show_verbose_locks = 0;
+UNIV_INTERN ulong srv_show_locks_held = 10;
+UNIV_INTERN ulong srv_show_verbose_locks = 0;
+/** Maximum number of times allowed to conditionally acquire
+mutex before switching to blocking wait on the mutex */
+#define MAX_MUTEX_NOWAIT 20
-/* The sort order table of the MySQL latin1_swedish_ci character set
+/** Check whether the number of failed nonblocking mutex
+acquisition attempts exceeds maximum allowed value. If so,
+srv_printf_innodb_monitor() will request mutex acquisition
+with mutex_enter(), which will wait until it gets the mutex. */
+#define MUTEX_NOWAIT(mutex_skipped) ((mutex_skipped) < MAX_MUTEX_NOWAIT)
+
+/** The sort order table of the MySQL latin1_swedish_ci character set
collation */
UNIV_INTERN const byte* srv_latin1_ordering;
@@ -410,7 +403,6 @@ UNIV_INTERN ulong srv_read_ahead = 3; /* 1: random 2: linear 3: Both */
UNIV_INTERN ulong srv_adaptive_checkpoint = 0; /* 0: none 1: reflex 2: estimate */
UNIV_INTERN ulong srv_expand_import = 0; /* 0:disable 1:enable */
-UNIV_INTERN ulint srv_relax_table_creation = 0; /* 0:disable 1:enable */
UNIV_INTERN ulint srv_pass_corrupt_table = 0; /* 0:disable 1:enable */
UNIV_INTERN ulong srv_extra_rsegments = 0; /* extra rseg for users */
@@ -439,7 +431,7 @@ static ulint srv_n_rows_inserted_old = 0;
static ulint srv_n_rows_updated_old = 0;
static ulint srv_n_rows_deleted_old = 0;
static ulint srv_n_rows_read_old = 0;
-
+UNIV_INTERN ulint srv_n_lock_deadlock_count = 0;
UNIV_INTERN ulint srv_n_lock_wait_count = 0;
UNIV_INTERN ulint srv_n_lock_wait_current_count = 0;
UNIV_INTERN ib_int64_t srv_n_lock_wait_time = 0;
@@ -1766,8 +1758,9 @@ srv_suspend_mysql_thread(
innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */
lock_wait_timeout = thd_lock_wait_timeout(trx->mysql_thd);
- if (lock_wait_timeout < 100000000
- && wait_time > (double) lock_wait_timeout) {
+ if (trx_is_interrupted(trx)
+ || (lock_wait_timeout < 100000000
+ && wait_time > (double) lock_wait_timeout)) {
trx->error_state = DB_LOCK_WAIT_TIMEOUT;
}
@@ -1833,12 +1826,15 @@ srv_refresh_innodb_monitor_stats(void)
}
/******************************************************************//**
-Outputs to a file the output of the InnoDB Monitor. */
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information printed
+due to failure to obtain necessary mutex */
UNIV_INTERN
-void
+ibool
srv_printf_innodb_monitor(
/*======================*/
FILE* file, /*!< in: output stream */
+ ibool nowait, /*!< in: whether to wait for kernel mutex */
ulint* trx_start, /*!< out: file position of the start of
the list of active transactions */
ulint* trx_end) /*!< out: file position of the end of
@@ -1847,6 +1843,7 @@ srv_printf_innodb_monitor(
double time_elapsed;
time_t current_time;
ulint n_reserved;
+ ibool ret;
ulint btr_search_sys_subtotal;
ulint lock_sys_subtotal;
@@ -1877,9 +1874,9 @@ srv_printf_innodb_monitor(
"Per second averages calculated from the last %lu seconds\n",
(ulong)time_elapsed);
- fputs("----------\n"
- "BACKGROUND THREAD\n"
- "----------\n", file);
+ fputs("-----------------\n"
+ "BACKGROUND THREAD\n"
+ "-----------------\n", file);
srv_print_master_thread_info(file);
fputs("----------\n"
@@ -2069,22 +2066,28 @@ srv_printf_innodb_monitor(
srv_n_rows_deleted_old = srv_n_rows_deleted;
srv_n_rows_read_old = srv_n_rows_read;
- lock_print_info_summary(file);
- if (trx_start) {
- long t = ftell(file);
- if (t < 0) {
- *trx_start = ULINT_UNDEFINED;
- } else {
- *trx_start = (ulint) t;
+ /* Only if lock_print_info_summary proceeds correctly,
+ before we call the lock_print_info_all_transactions
+ to print all the lock information. */
+ ret = lock_print_info_summary(file, nowait);
+
+ if (ret) {
+ if (trx_start) {
+ long t = ftell(file);
+ if (t < 0) {
+ *trx_start = ULINT_UNDEFINED;
+ } else {
+ *trx_start = (ulint) t;
+ }
}
- }
- lock_print_info_all_transactions(file);
- if (trx_end) {
- long t = ftell(file);
- if (t < 0) {
- *trx_end = ULINT_UNDEFINED;
- } else {
- *trx_end = (ulint) t;
+ lock_print_info_all_transactions(file);
+ if (trx_end) {
+ long t = ftell(file);
+ if (t < 0) {
+ *trx_end = ULINT_UNDEFINED;
+ } else {
+ *trx_end = (ulint) t;
+ }
}
}
@@ -2093,6 +2096,8 @@ srv_printf_innodb_monitor(
"============================\n", file);
mutex_exit(&srv_innodb_monitor_mutex);
fflush(file);
+
+ return(ret);
}
/******************************************************************//**
@@ -2133,6 +2138,8 @@ srv_export_innodb_status(void)
= UT_LIST_GET_LEN(buf_pool->flush_list);
export_vars.innodb_buffer_pool_pages_free
= UT_LIST_GET_LEN(buf_pool->free);
+ export_vars.innodb_deadlocks
+ = srv_n_lock_deadlock_count;
#ifdef UNIV_DEBUG
export_vars.innodb_buffer_pool_pages_latched
= buf_get_latched_pages_number();
@@ -2181,26 +2188,23 @@ srv_export_innodb_status(void)
}
/*********************************************************************//**
-A thread which wakes up threads whose lock wait may have lasted too long.
-This also prints the info output by various InnoDB monitors.
+A thread which prints the info output by various InnoDB monitors.
@return a dummy parameter */
UNIV_INTERN
os_thread_ret_t
-srv_lock_timeout_and_monitor_thread(
-/*================================*/
+srv_monitor_thread(
+/*===============*/
void* arg __attribute__((unused)))
/*!< in: a dummy parameter required by
os_thread_create */
{
- srv_slot_t* slot;
double time_elapsed;
time_t current_time;
time_t last_table_monitor_time;
time_t last_tablespace_monitor_time;
time_t last_monitor_time;
- ibool some_waits;
- double wait_time;
- ulint i;
+ ulint mutex_skipped;
+ ibool last_srv_print_monitor;
#ifdef UNIV_DEBUG_THREAD_CREATION
fprintf(stderr, "Lock timeout thread starts, id %lu\n",
@@ -2211,13 +2215,15 @@ srv_lock_timeout_and_monitor_thread(
last_table_monitor_time = time(NULL);
last_tablespace_monitor_time = time(NULL);
last_monitor_time = time(NULL);
+ mutex_skipped = 0;
+ last_srv_print_monitor = srv_print_innodb_monitor;
loop:
- srv_lock_timeout_and_monitor_active = TRUE;
+ srv_monitor_active = TRUE;
- /* When someone is waiting for a lock, we wake up every second
- and check if a timeout has passed for a lock wait */
+ /* Wake up every 5 seconds to see if we need to print
+ monitor information. */
- os_thread_sleep(1000000);
+ os_thread_sleep(5000000);
current_time = time(NULL);
@@ -2227,14 +2233,40 @@ loop:
last_monitor_time = time(NULL);
if (srv_print_innodb_monitor) {
- srv_printf_innodb_monitor(stderr, NULL, NULL);
+ /* Reset mutex_skipped counter everytime
+ srv_print_innodb_monitor changes. This is to
+ ensure we will not be blocked by kernel_mutex
+ for short duration information printing,
+ such as requested by sync_array_print_long_waits() */
+ if (!last_srv_print_monitor) {
+ mutex_skipped = 0;
+ last_srv_print_monitor = TRUE;
+ }
+
+ if (!srv_printf_innodb_monitor(stderr,
+ MUTEX_NOWAIT(mutex_skipped),
+ NULL, NULL)) {
+ mutex_skipped++;
+ } else {
+ /* Reset the counter */
+ mutex_skipped = 0;
+ }
+ } else {
+ last_srv_print_monitor = FALSE;
}
+
if (srv_innodb_status) {
mutex_enter(&srv_monitor_file_mutex);
rewind(srv_monitor_file);
- srv_printf_innodb_monitor(srv_monitor_file, NULL,
- NULL);
+ if (!srv_printf_innodb_monitor(srv_monitor_file,
+ MUTEX_NOWAIT(mutex_skipped),
+ NULL, NULL)) {
+ mutex_skipped++;
+ } else {
+ mutex_skipped = 0;
+ }
+
os_file_set_eof(srv_monitor_file);
mutex_exit(&srv_monitor_file_mutex);
}
@@ -2287,6 +2319,56 @@ loop:
}
}
+ if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
+ goto exit_func;
+ }
+
+ if (srv_print_innodb_monitor
+ || srv_print_innodb_lock_monitor
+ || srv_print_innodb_tablespace_monitor
+ || srv_print_innodb_table_monitor) {
+ goto loop;
+ }
+
+ srv_monitor_active = FALSE;
+
+ goto loop;
+
+exit_func:
+ srv_monitor_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+A thread which wakes up threads whose lock wait may have lasted too long.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_lock_timeout_thread(
+/*====================*/
+ void* arg __attribute__((unused)))
+ /* in: a dummy parameter required by
+ os_thread_create */
+{
+ srv_slot_t* slot;
+ ibool some_waits;
+ double wait_time;
+ ulint i;
+
+loop:
+ /* When someone is waiting for a lock, we wake up every second
+ and check if a timeout has passed for a lock wait */
+
+ os_thread_sleep(1000000);
+
+ srv_lock_timeout_active = TRUE;
+
mutex_enter(&kernel_mutex);
some_waits = FALSE;
@@ -2310,9 +2392,10 @@ loop:
lock_wait_timeout = thd_lock_wait_timeout(
trx->mysql_thd);
- if (lock_wait_timeout < 100000000
- && (wait_time > (double) lock_wait_timeout
- || wait_time < 0)) {
+ if (trx_is_interrupted(trx)
+ || (lock_wait_timeout < 100000000
+ && (wait_time > (double) lock_wait_timeout
+ || wait_time < 0))) {
/* Timeout exceeded or a wrap-around in system
time counter: cancel the lock request queued
@@ -2337,17 +2420,11 @@ loop:
goto exit_func;
}
- if (some_waits || srv_print_innodb_monitor
- || srv_print_innodb_lock_monitor
- || srv_print_innodb_tablespace_monitor
- || srv_print_innodb_table_monitor) {
+ if (some_waits) {
goto loop;
}
- /* No one was waiting for a lock and no monitor was active:
- suspend this thread */
-
- srv_lock_timeout_and_monitor_active = FALSE;
+ srv_lock_timeout_active = FALSE;
#if 0
/* The following synchronisation is disabled, since
@@ -2357,7 +2434,7 @@ loop:
goto loop;
exit_func:
- srv_lock_timeout_and_monitor_active = FALSE;
+ srv_lock_timeout_active = FALSE;
/* We count the number of threads in os_thread_exit(). A created
thread should always use that to exit and not use return() to exit. */
@@ -2706,7 +2783,10 @@ loop:
BUF_FLUSH_LIST,
n_flush,
IB_ULONGLONG_MAX);
- skip_sleep = TRUE;
+
+ if (n_flush == PCT_IO(100)) {
+ skip_sleep = TRUE;
+ }
}
mutex_enter(&(log_sys->mutex));
diff --git a/storage/xtradb/srv/srv0start.c b/storage/xtradb/srv/srv0start.c
index dcc13ea17b6..7b5581a24f0 100644
--- a/storage/xtradb/srv/srv0start.c
+++ b/storage/xtradb/srv/srv0start.c
@@ -1,7 +1,8 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
+Copyright (c) 2009, Percona Inc.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -9,6 +10,13 @@ briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
@@ -22,32 +30,6 @@ this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA
*****************************************************************************/
-/***********************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-Copyright (c) 2009, Percona Inc.
-
-Portions of this file contain modifications contributed and copyrighted
-by Percona Inc.. Those modifications are
-gratefully acknowledged and are described briefly in the InnoDB
-documentation. The contributions by Percona Inc. are incorporated with
-their permission, and subject to the conditions contained in the file
-COPYING.Percona.
-
-This program is free software; you can redistribute it and/or modify it
-under the terms of the GNU General Public License as published by the
-Free Software Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
-Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-***********************************************************************/
/********************************************************************//**
@file srv/srv0start.c
@@ -105,6 +87,7 @@ Created 2/16/1996 Heikki Tuuri
# include "btr0pcur.h"
# include "thr0loc.h"
# include "os0sync.h" /* for INNODB_RW_LOCKS_USE_ATOMICS */
+# include "zlib.h" /* for ZLIB_VERSION */
/** Log sequence number immediately after startup */
UNIV_INTERN ib_uint64_t srv_start_lsn;
@@ -143,9 +126,9 @@ static mutex_t ios_mutex;
static ulint ios;
/** io_handler_thread parameters for thread identification */
-static ulint n[SRV_MAX_N_IO_THREADS + 5 + 64];
+static ulint n[SRV_MAX_N_IO_THREADS + 6 + 64];
/** io_handler_thread identifiers */
-static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 5 + 64];
+static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + 64];
/** We use this mutex to test the return value of pthread_mutex_trylock
on successful locking. HP-UX does NOT return 0, though Linux et al do. */
@@ -728,6 +711,7 @@ open_or_create_data_files(
/*======================*/
ibool* create_new_db, /*!< out: TRUE if new database should be
created */
+ ibool* create_new_doublewrite_file,
#ifdef UNIV_LOG_ARCHIVE
ulint* min_arch_log_no,/*!< out: min of archived log
numbers in data files */
@@ -760,6 +744,7 @@ open_or_create_data_files(
*sum_of_new_sizes = 0;
*create_new_db = FALSE;
+ *create_new_doublewrite_file = FALSE;
srv_normalize_path_for_win(srv_data_home);
@@ -992,6 +977,142 @@ skip_size_check:
srv_data_file_is_raw_partition[i] != 0);
}
+ /* special file for doublewrite buffer */
+ if (srv_doublewrite_file)
+ {
+ srv_normalize_path_for_win(srv_doublewrite_file);
+
+ fprintf(stderr,
+ "InnoDB: Notice: innodb_doublewrite_file is specified.\n"
+ "InnoDB: This is for expert only. Don't use if you don't understand what is it 'WELL'.\n"
+ "InnoDB: ### Don't specify older file than the last checkpoint ###\n"
+ "InnoDB: otherwise the older doublewrite buffer will break your data during recovery!\n");
+
+ strcpy(name, srv_doublewrite_file);
+
+ /* First we try to create the file: if it already
+ exists, ret will get value FALSE */
+
+ files[i] = os_file_create(name, OS_FILE_CREATE,
+ OS_FILE_NORMAL,
+ OS_DATA_FILE, &ret);
+
+ if (ret == FALSE && os_file_get_last_error(FALSE)
+ != OS_FILE_ALREADY_EXISTS
+#ifdef UNIV_AIX
+ /* AIX 5.1 after security patch ML7 may have
+ errno set to 0 here, which causes our function
+ to return 100; work around that AIX problem */
+ && os_file_get_last_error(FALSE) != 100
+#endif
+ ) {
+ fprintf(stderr,
+ "InnoDB: Error in creating"
+ " or opening %s\n",
+ name);
+
+ return(DB_ERROR);
+ }
+
+ if (ret == FALSE) {
+ /* We open the data file */
+
+ files[i] = os_file_create(
+ name, OS_FILE_OPEN, OS_FILE_NORMAL,
+ OS_DATA_FILE, &ret);
+
+ if (!ret) {
+ fprintf(stderr,
+ "InnoDB: Error in opening %s\n", name);
+ os_file_get_last_error(TRUE);
+
+ return(DB_ERROR);
+ }
+
+ ret = os_file_get_size(files[i], &size, &size_high);
+ ut_a(ret);
+ /* Round size downward to megabytes */
+
+ rounded_size_pages
+ = (size / (1024 * 1024) + 4096 * size_high)
+ << (20 - UNIV_PAGE_SIZE_SHIFT);
+
+ if (rounded_size_pages != TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9) {
+
+ fprintf(stderr,
+ "InnoDB: Warning: doublewrite buffer file %s"
+ " is of a different size\n"
+ "InnoDB: %lu pages"
+ " (rounded down to MB)\n"
+ "InnoDB: than intended size"
+ " %lu pages...\n",
+ name,
+ (ulong) rounded_size_pages,
+ (ulong) TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9);
+ }
+
+ fil_read_flushed_lsn_and_arch_log_no(
+ files[i], one_opened,
+#ifdef UNIV_LOG_ARCHIVE
+ min_arch_log_no, max_arch_log_no,
+#endif /* UNIV_LOG_ARCHIVE */
+ min_flushed_lsn, max_flushed_lsn);
+ one_opened = TRUE;
+ } else {
+ /* We created the data file and now write it full of
+ zeros */
+
+ *create_new_doublewrite_file = TRUE;
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Doublewrite buffer file %s did not"
+ " exist: new to be created\n",
+ name);
+
+ if (*create_new_db == FALSE) {
+ fprintf(stderr,
+ "InnoDB: Warning: Previous version's ibdata files may cause crash.\n"
+ " If you use that, please use the ibdata files of this version.\n");
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Setting file %s size to %lu MB\n",
+ name,
+ (ulong) ((TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9)
+ >> (20 - UNIV_PAGE_SIZE_SHIFT)));
+
+ fprintf(stderr,
+ "InnoDB: Database physically writes the"
+ " file full: wait...\n");
+
+ ret = os_file_set_size(
+ name, files[i],
+ srv_calc_low32(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9),
+ srv_calc_high32(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9));
+
+ if (!ret) {
+ fprintf(stderr,
+ "InnoDB: Error in creating %s:"
+ " probably out of disk space\n", name);
+
+ return(DB_ERROR);
+ }
+ }
+
+ ret = os_file_close(files[i]);
+ ut_a(ret);
+
+ fil_space_create(name, TRX_DOUBLEWRITE_SPACE, 0, FIL_TABLESPACE);
+
+ ut_a(fil_validate());
+
+ fil_node_create(name, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9, TRX_DOUBLEWRITE_SPACE, FALSE);
+
+ i++;
+ }
+
ios = 0;
mutex_create(&ios_mutex, SYNC_NO_ORDER_CHECK);
@@ -1010,6 +1131,7 @@ innobase_start_or_create_for_mysql(void)
{
buf_pool_t* ret;
ibool create_new_db;
+ ibool create_new_doublewrite_file;
ibool log_file_created;
ibool log_created = FALSE;
ibool log_opened = FALSE;
@@ -1074,7 +1196,11 @@ innobase_start_or_create_for_mysql(void)
#ifdef UNIV_IBUF_DEBUG
fprintf(stderr,
"InnoDB: !!!!!!!! UNIV_IBUF_DEBUG switched on !!!!!!!!!\n"
- "InnoDB: Crash recovery will fail with UNIV_IBUF_DEBUG\n");
+# ifdef UNIV_IBUF_COUNT_DEBUG
+ "InnoDB: !!!!!!!! UNIV_IBUF_COUNT_DEBUG switched on !!!!!!!!!\n"
+ "InnoDB: Crash recovery will fail with UNIV_IBUF_COUNT_DEBUG\n"
+# endif
+ );
#endif
#ifdef UNIV_SYNC_DEBUG
@@ -1101,7 +1227,15 @@ innobase_start_or_create_for_mysql(void)
"InnoDB: The InnoDB memory heap is disabled\n");
}
- fprintf(stderr, "InnoDB: %s\n", IB_ATOMICS_STARTUP_MSG);
+ fputs("InnoDB: " IB_ATOMICS_STARTUP_MSG
+ "\nInnoDB: Compressed tables use zlib " ZLIB_VERSION
+#ifdef UNIV_ZIP_DEBUG
+ " with validation"
+#endif /* UNIV_ZIP_DEBUG */
+#ifdef UNIV_ZIP_COPY
+ " and extra copying"
+#endif /* UNIV_ZIP_COPY */
+ "\n" , stderr);
/* Since InnoDB does not currently clean up all its internal data
structures in MySQL Embedded Server Library server_end(), we
@@ -1388,6 +1522,7 @@ innobase_start_or_create_for_mysql(void)
}
err = open_or_create_data_files(&create_new_db,
+ &create_new_doublewrite_file,
#ifdef UNIV_LOG_ARCHIVE
&min_arch_log_no, &max_arch_log_no,
#endif /* UNIV_LOG_ARCHIVE */
@@ -1504,6 +1639,14 @@ innobase_start_or_create_for_mysql(void)
trx_sys_file_format_init();
+ if (create_new_doublewrite_file) {
+ mtr_start(&mtr);
+ fsp_header_init(TRX_DOUBLEWRITE_SPACE, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9, &mtr);
+ mtr_commit(&mtr);
+
+ trx_sys_dummy_create(TRX_DOUBLEWRITE_SPACE);
+ }
+
if (create_new_db) {
mtr_start(&mtr);
fsp_header_init(0, sum_of_new_sizes, &mtr);
@@ -1596,6 +1739,14 @@ innobase_start_or_create_for_mysql(void)
dict_boot();
trx_sys_init_at_db_start();
+ /* Initialize the fsp free limit global variable in the log
+ system */
+ fsp_header_get_free_limit();
+
+ /* recv_recovery_from_checkpoint_finish needs trx lists which
+ are initialized in trx_sys_init_at_db_start(). */
+
+ recv_recovery_from_checkpoint_finish();
if (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE) {
/* The following call is necessary for the insert
buffer to work with multiple tablespaces. We must
@@ -1611,26 +1762,14 @@ innobase_start_or_create_for_mysql(void)
every table in the InnoDB data dictionary that has
an .ibd file.
- We also determine the maximum tablespace id used.
-
- TODO: We may have incomplete transactions in the
- data dictionary tables. Does that harm the scanning of
- the data dictionary below? */
+ We also determine the maximum tablespace id used. */
dict_check_tablespaces_and_store_max_id(
recv_needed_recovery);
}
srv_startup_is_before_trx_rollback_phase = FALSE;
-
- /* Initialize the fsp free limit global variable in the log
- system */
- fsp_header_get_free_limit();
-
- /* recv_recovery_from_checkpoint_finish needs trx lists which
- are initialized in trx_sys_init_at_db_start(). */
-
- recv_recovery_from_checkpoint_finish();
+ recv_recovery_rollback_active();
/* It is possible that file_format tag has never
been set. In this case we initialize it to minimum
@@ -1679,15 +1818,18 @@ innobase_start_or_create_for_mysql(void)
/* fprintf(stderr, "Max allowed record size %lu\n",
page_get_free_space_of_empty() / 2); */
- /* Create the thread which watches the timeouts for lock waits
- and prints InnoDB monitor info */
-
- os_thread_create(&srv_lock_timeout_and_monitor_thread, NULL,
+ /* Create the thread which watches the timeouts for lock waits */
+ os_thread_create(&srv_lock_timeout_thread, NULL,
thread_ids + 2 + SRV_MAX_N_IO_THREADS);
/* Create the thread which warns of long semaphore waits */
os_thread_create(&srv_error_monitor_thread, NULL,
thread_ids + 3 + SRV_MAX_N_IO_THREADS);
+
+ /* Create the thread which prints InnoDB monitor info */
+ os_thread_create(&srv_monitor_thread, NULL,
+ thread_ids + 4 + SRV_MAX_N_IO_THREADS);
+
srv_is_being_started = FALSE;
if (trx_doublewrite == NULL) {
@@ -1712,13 +1854,13 @@ innobase_start_or_create_for_mysql(void)
ulint i;
os_thread_create(&srv_purge_thread, NULL, thread_ids
- + (4 + SRV_MAX_N_IO_THREADS));
+ + (5 + SRV_MAX_N_IO_THREADS));
for (i = 0; i < srv_use_purge_thread - 1; i++) {
- n[5 + i + SRV_MAX_N_IO_THREADS] = i; /* using as index for arrays in purge_sys */
+ n[6 + i + SRV_MAX_N_IO_THREADS] = i; /* using as index for arrays in purge_sys */
os_thread_create(&srv_purge_worker_thread,
- n + (5 + i + SRV_MAX_N_IO_THREADS),
- thread_ids + (5 + i + SRV_MAX_N_IO_THREADS));
+ n + (6 + i + SRV_MAX_N_IO_THREADS),
+ thread_ids + (6 + i + SRV_MAX_N_IO_THREADS));
}
}
#ifdef UNIV_DEBUG
@@ -1821,7 +1963,7 @@ innobase_start_or_create_for_mysql(void)
if (srv_print_verbose_log) {
ut_print_timestamp(stderr);
fprintf(stderr,
- " InnoDB Plugin %s started; "
+ " Percona XtraDB (http://www.percona.com) %s started; "
"log sequence number %llu\n",
INNODB_VERSION_STR, srv_start_lsn);
}
diff --git a/storage/xtradb/sync/sync0sync.c b/storage/xtradb/sync/sync0sync.c
index c0e543f284d..fe14cbf0886 100644
--- a/storage/xtradb/sync/sync0sync.c
+++ b/storage/xtradb/sync/sync0sync.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -317,6 +317,15 @@ mutex_free(
ut_a(mutex_get_lock_word(mutex) == 0);
ut_a(mutex_get_waiters(mutex) == 0);
+#ifdef UNIV_MEM_DEBUG
+ if (mutex == &mem_hash_mutex) {
+ ut_ad(UT_LIST_GET_LEN(mutex_list) == 1);
+ ut_ad(UT_LIST_GET_FIRST(mutex_list) == &mem_hash_mutex);
+ UT_LIST_REMOVE(list, mutex_list, mutex);
+ goto func_exit;
+ }
+#endif /* UNIV_MEM_DEBUG */
+
if (mutex != &mutex_list_mutex
#ifdef UNIV_SYNC_DEBUG
&& mutex != &sync_thread_mutex
@@ -338,7 +347,9 @@ mutex_free(
}
os_event_free(mutex->event);
-
+#ifdef UNIV_MEM_DEBUG
+func_exit:
+#endif /* UNIV_MEM_DEBUG */
#if !defined(HAVE_ATOMIC_BUILTINS)
os_fast_mutex_free(&(mutex->os_fast_mutex));
#endif
@@ -959,12 +970,62 @@ sync_thread_levels_contain(
}
/******************************************************************//**
+Checks if the level array for the current thread contains a
+mutex or rw-latch at the specified level.
+@return a matching latch, or NULL if not found */
+UNIV_INTERN
+void*
+sync_thread_levels_contains(
+/*========================*/
+ ulint level) /*!< in: latching order level
+ (SYNC_DICT, ...)*/
+{
+ sync_level_t* arr;
+ sync_thread_t* thread_slot;
+ sync_level_t* slot;
+ ulint i;
+
+ if (!sync_order_checks_on) {
+
+ return(NULL);
+ }
+
+ mutex_enter(&sync_thread_mutex);
+
+ thread_slot = sync_thread_level_arrays_find_slot();
+
+ if (thread_slot == NULL) {
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(NULL);
+ }
+
+ arr = thread_slot->levels;
+
+ for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) {
+
+ slot = sync_thread_levels_get_nth(arr, i);
+
+ if (slot->latch != NULL && slot->level == level) {
+
+ mutex_exit(&sync_thread_mutex);
+ return(slot->latch);
+ }
+ }
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(NULL);
+}
+
+/******************************************************************//**
Checks that the level array for the current thread is empty.
-@return TRUE if empty except the exceptions specified below */
+@return a latch, or NULL if empty except the exceptions specified below */
UNIV_INTERN
-ibool
-sync_thread_levels_empty_gen(
-/*=========================*/
+void*
+sync_thread_levels_nonempty_gen(
+/*============================*/
ibool dict_mutex_allowed) /*!< in: TRUE if dictionary mutex is
allowed to be owned by the thread,
also purge_is_running mutex is
@@ -977,7 +1038,7 @@ sync_thread_levels_empty_gen(
if (!sync_order_checks_on) {
- return(TRUE);
+ return(NULL);
}
mutex_enter(&sync_thread_mutex);
@@ -988,7 +1049,7 @@ sync_thread_levels_empty_gen(
mutex_exit(&sync_thread_mutex);
- return(TRUE);
+ return(NULL);
}
arr = thread_slot->levels;
@@ -1005,13 +1066,13 @@ sync_thread_levels_empty_gen(
mutex_exit(&sync_thread_mutex);
ut_error;
- return(FALSE);
+ return(slot->latch);
}
}
mutex_exit(&sync_thread_mutex);
- return(TRUE);
+ return(NULL);
}
/******************************************************************//**
@@ -1388,6 +1449,12 @@ sync_close(void)
mutex = UT_LIST_GET_FIRST(mutex_list);
while (mutex) {
+#ifdef UNIV_MEM_DEBUG
+ if (mutex == &mem_hash_mutex) {
+ mutex = UT_LIST_GET_NEXT(list, mutex);
+ continue;
+ }
+#endif /* UNIV_MEM_DEBUG */
mutex_free(mutex);
mutex = UT_LIST_GET_FIRST(mutex_list);
}
diff --git a/storage/xtradb/trx/trx0i_s.c b/storage/xtradb/trx/trx0i_s.c
index 78b20edd365..c160eb2942a 100644
--- a/storage/xtradb/trx/trx0i_s.c
+++ b/storage/xtradb/trx/trx0i_s.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -28,11 +28,18 @@ table cache" for later retrieval.
Created July 17, 2007 Vasil Dimov
*******************************************************/
+/* Found during the build of 5.5.3 on Linux 2.4 and early 2.6 kernels:
+ The includes "univ.i" -> "my_global.h" cause a different path
+ to be taken further down with pthread functions and types,
+ so they must come first.
+ From the symptoms, this is related to bug#46587 in the MySQL bug DB.
+*/
+#include "univ.i"
+
+#include <mysql/plugin.h>
#include "mysql_addons.h"
-#include "univ.i"
-#include <mysql/plugin.h>
#include "buf0buf.h"
#include "dict0dict.h"
#include "ha0storage.h"
diff --git a/storage/xtradb/trx/trx0rec.c b/storage/xtradb/trx/trx0rec.c
index 5097cf18dcd..f50e10ed756 100644
--- a/storage/xtradb/trx/trx0rec.c
+++ b/storage/xtradb/trx/trx0rec.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -350,8 +350,13 @@ trx_undo_rec_get_col_val(
ut_ad(*orig_len >= BTR_EXTERN_FIELD_REF_SIZE);
ut_ad(*len > *orig_len);
- ut_ad(*len >= REC_MAX_INDEX_COL_LEN
+ /* @see dtuple_convert_big_rec() */
+ ut_ad(*len >= BTR_EXTERN_FIELD_REF_SIZE * 2);
+ /* we do not have access to index->table here
+ ut_ad(dict_table_get_format(index->table) >= DICT_TF_FORMAT_ZIP
+ || *len >= REC_MAX_INDEX_COL_LEN
+ BTR_EXTERN_FIELD_REF_SIZE);
+ */
*len += UNIV_EXTERN_STORAGE_FIELD;
break;
@@ -977,6 +982,7 @@ trx_undo_update_rec_get_update(
fprintf(stderr, "\n"
"InnoDB: n_fields = %lu, i = %lu, ptr %p\n",
(ulong) n_fields, (ulong) i, ptr);
+ *upd = NULL;
return(NULL);
}
@@ -1074,11 +1080,15 @@ trx_undo_rec_get_partial_row(
/* If the prefix of this column is indexed,
ensure that enough prefix is stored in the
undo log record. */
- ut_a(ignore_prefix
- || !col->ord_part
- || dfield_get_len(dfield)
- >= REC_MAX_INDEX_COL_LEN
- + BTR_EXTERN_FIELD_REF_SIZE);
+ if (!ignore_prefix && col->ord_part) {
+ ut_a(dfield_get_len(dfield)
+ >= 2 * BTR_EXTERN_FIELD_REF_SIZE);
+ ut_a(dict_table_get_format(index->table)
+ >= DICT_TF_FORMAT_ZIP
+ || dfield_get_len(dfield)
+ >= REC_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE);
+ }
}
}
diff --git a/storage/xtradb/trx/trx0rseg.c b/storage/xtradb/trx/trx0rseg.c
index 8d754788e2a..57b5611d624 100644
--- a/storage/xtradb/trx/trx0rseg.c
+++ b/storage/xtradb/trx/trx0rseg.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
diff --git a/storage/xtradb/trx/trx0sys.c b/storage/xtradb/trx/trx0sys.c
index 8ea34a8c81c..7f50973d65e 100644
--- a/storage/xtradb/trx/trx0sys.c
+++ b/storage/xtradb/trx/trx0sys.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -402,6 +402,149 @@ start_again:
goto start_again;
}
+
+ if (srv_doublewrite_file) {
+ /* the same doublewrite buffer to TRX_SYS_SPACE should exist.
+ check and create if not exist.*/
+
+ mtr_start(&mtr);
+ trx_doublewrite_buf_is_being_created = TRUE;
+
+ block = buf_page_get(TRX_DOUBLEWRITE_SPACE, 0, TRX_SYS_PAGE_NO,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+ == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+ /* The doublewrite buffer has already been created:
+ just read in some numbers */
+
+ mtr_commit(&mtr);
+ } else {
+ fprintf(stderr,
+ "InnoDB: Doublewrite buffer not found in the doublewrite file:"
+ " creating new\n");
+
+ if (buf_pool_get_curr_size()
+ < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ + FSP_EXTENT_SIZE / 2 + 100)
+ * UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Cannot create doublewrite buffer:"
+ " you must\n"
+ "InnoDB: increase your buffer pool size.\n"
+ "InnoDB: Cannot continue operation.\n");
+
+ exit(1);
+ }
+
+ block2 = fseg_create(TRX_DOUBLEWRITE_SPACE, TRX_SYS_PAGE_NO,
+ TRX_SYS_DOUBLEWRITE
+ + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
+
+ /* fseg_create acquires a second latch on the page,
+ therefore we must declare it: */
+
+ buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
+
+ if (block2 == NULL) {
+ fprintf(stderr,
+ "InnoDB: Cannot create doublewrite buffer:"
+ " you must\n"
+ "InnoDB: increase your tablespace size.\n"
+ "InnoDB: Cannot continue operation.\n");
+
+ /* We exit without committing the mtr to prevent
+ its modifications to the database getting to disk */
+
+ exit(1);
+ }
+
+ fseg_header = buf_block_get_frame(block)
+ + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG;
+ prev_page_no = 0;
+
+ for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ + FSP_EXTENT_SIZE / 2; i++) {
+ page_no = fseg_alloc_free_page(fseg_header,
+ prev_page_no + 1,
+ FSP_UP, &mtr);
+ if (page_no == FIL_NULL) {
+ fprintf(stderr,
+ "InnoDB: Cannot create doublewrite"
+ " buffer: you must\n"
+ "InnoDB: increase your"
+ " tablespace size.\n"
+ "InnoDB: Cannot continue operation.\n"
+ );
+
+ exit(1);
+ }
+
+ /* We read the allocated pages to the buffer pool;
+ when they are written to disk in a flush, the space
+ id and page number fields are also written to the
+ pages. When we at database startup read pages
+ from the doublewrite buffer, we know that if the
+ space id and page number in them are the same as
+ the page position in the tablespace, then the page
+ has not been written to in doublewrite. */
+
+ new_block = buf_page_get(TRX_DOUBLEWRITE_SPACE, 0, page_no,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(new_block,
+ SYNC_NO_ORDER_CHECK);
+
+ if (i == FSP_EXTENT_SIZE / 2) {
+ ut_a(page_no == FSP_EXTENT_SIZE);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_BLOCK1,
+ page_no, MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_REPEAT
+ + TRX_SYS_DOUBLEWRITE_BLOCK1,
+ page_no, MLOG_4BYTES, &mtr);
+ } else if (i == FSP_EXTENT_SIZE / 2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ ut_a(page_no == 2 * FSP_EXTENT_SIZE);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_BLOCK2,
+ page_no, MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_REPEAT
+ + TRX_SYS_DOUBLEWRITE_BLOCK2,
+ page_no, MLOG_4BYTES, &mtr);
+ } else if (i > FSP_EXTENT_SIZE / 2) {
+ ut_a(page_no == prev_page_no + 1);
+ }
+
+ prev_page_no = page_no;
+ }
+
+ mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
+ TRX_SYS_DOUBLEWRITE_MAGIC_N,
+ MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
+ + TRX_SYS_DOUBLEWRITE_REPEAT,
+ TRX_SYS_DOUBLEWRITE_MAGIC_N,
+ MLOG_4BYTES, &mtr);
+
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+ MLOG_4BYTES, &mtr);
+ mtr_commit(&mtr);
+
+ /* Flush the modified pages to disk and make a checkpoint */
+ log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+
+ fprintf(stderr, "InnoDB: Doublewrite buffer created in the doublewrite file\n");
+ }
+
+ trx_doublewrite_buf_is_being_created = FALSE;
+ }
}
/****************************************************************//**
@@ -425,10 +568,19 @@ trx_sys_doublewrite_init_or_restore_pages(
ulint source_page_no;
byte* page;
byte* doublewrite;
+ ulint doublewrite_space_id;
ulint space_id;
ulint page_no;
ulint i;
+ doublewrite_space_id = (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE);
+
+ if (srv_doublewrite_file) {
+ fprintf(stderr,
+ "InnoDB: doublewrite file '%s' is used.\n",
+ srv_doublewrite_file);
+ }
+
/* We do the file i/o past the buffer pool */
unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
@@ -437,7 +589,7 @@ trx_sys_doublewrite_init_or_restore_pages(
/* Read the trx sys header to check if we are using the doublewrite
buffer */
- fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0,
+ fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, TRX_SYS_PAGE_NO, 0,
UNIV_PAGE_SIZE, read_buf, NULL);
doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
@@ -475,10 +627,10 @@ trx_sys_doublewrite_init_or_restore_pages(
/* Read the pages from the doublewrite buffer to memory */
- fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block1, 0,
+ fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block1, 0,
TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
buf, NULL);
- fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block2, 0,
+ fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block2, 0,
TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
NULL);
@@ -534,7 +686,8 @@ trx_sys_doublewrite_init_or_restore_pages(
" doublewrite buf.\n",
(ulong) space_id, (ulong) page_no, (ulong) i);
- } else if (space_id == TRX_SYS_SPACE
+ } else if ((space_id == TRX_SYS_SPACE
+ || (srv_doublewrite_file && space_id == TRX_DOUBLEWRITE_SPACE))
&& ((page_no >= block1
&& page_no
< block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
@@ -594,8 +747,8 @@ trx_sys_doublewrite_init_or_restore_pages(
" recover the database"
" with the my.cnf\n"
"InnoDB: option:\n"
- "InnoDB: set-variable="
- "innodb_force_recovery=6\n");
+ "InnoDB:"
+ " innodb_force_recovery=6\n");
exit(1);
}
@@ -982,6 +1135,87 @@ trx_sysf_create(
}
/*****************************************************************//**
+Creates dummy of the file page for the transaction system. */
+static
+void
+trx_sysf_dummy_create(
+/*==================*/
+ ulint space,
+ mtr_t* mtr)
+{
+ trx_sysf_t* sys_header;
+ ulint slot_no;
+ buf_block_t* block;
+ page_t* page;
+ ulint page_no;
+ ulint i;
+
+ ut_ad(mtr);
+
+ /* Note that below we first reserve the file space x-latch, and
+ then enter the kernel: we must do it in this order to conform
+ to the latching order rules. */
+
+ mtr_x_lock(fil_space_get_latch(space, NULL), mtr);
+ mutex_enter(&kernel_mutex);
+
+ /* Create the trx sys file block in a new allocated file segment */
+ block = fseg_create(space, 0, TRX_SYS + TRX_SYS_FSEG_HEADER,
+ mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
+
+ fprintf(stderr, "%lu\n", buf_block_get_page_no(block));
+ ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO);
+
+ page = buf_block_get_frame(block);
+
+ mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS,
+ MLOG_2BYTES, mtr);
+
+ /* Reset the doublewrite buffer magic number to zero so that we
+ know that the doublewrite buffer has not yet been created (this
+ suppresses a Valgrind warning) */
+
+ mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE
+ + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr);
+
+#ifdef UNDEFINED
+ /* TODO: REMOVE IT: The bellow is not needed, I think */
+ sys_header = trx_sysf_get(mtr);
+
+ /* Start counting transaction ids from number 1 up */
+ mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE,
+ ut_dulint_create(0, 1), mtr);
+
+ /* Reset the rollback segment slots */
+ for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+
+ trx_sysf_rseg_set_space(sys_header, i, ULINT_UNDEFINED, mtr);
+ trx_sysf_rseg_set_page_no(sys_header, i, FIL_NULL, mtr);
+ }
+
+ /* The remaining area (up to the page trailer) is uninitialized.
+ Silence Valgrind warnings about it. */
+ UNIV_MEM_VALID(sys_header + (TRX_SYS_RSEGS
+ + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_SPACE),
+ (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END
+ - (TRX_SYS_RSEGS
+ + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_SPACE))
+ + page - sys_header);
+
+ /* Create the first rollback segment in the SYSTEM tablespace */
+ page_no = trx_rseg_header_create(space, 0, ULINT_MAX, &slot_no,
+ mtr);
+ ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
+ ut_a(page_no != FIL_NULL);
+#endif
+
+ mutex_exit(&kernel_mutex);
+}
+
+/*****************************************************************//**
Creates and initializes the central memory structures for the transaction
system. This is called when the database is started. */
UNIV_INTERN
@@ -1087,6 +1321,26 @@ trx_sys_create(void)
trx_sys_init_at_db_start();
}
+/*****************************************************************//**
+Creates and initializes the dummy transaction system page for tablespace. */
+UNIV_INTERN
+void
+trx_sys_dummy_create(
+/*=================*/
+ ulint space)
+{
+ mtr_t mtr;
+
+ /* This function is only for doublewrite file for now */
+ ut_a(space == TRX_DOUBLEWRITE_SPACE);
+
+ mtr_start(&mtr);
+
+ trx_sysf_dummy_create(space, &mtr);
+
+ mtr_commit(&mtr);
+}
+
/*********************************************************************
Create extra rollback segments when create_new_db */
UNIV_INTERN
@@ -1608,6 +1862,7 @@ trx_sys_file_format_id_to_name(
#endif /* !UNIV_HOTBACKUP */
+#ifndef UNIV_HOTBACKUP
/*********************************************************************
Shutdown/Close the transaction system. */
UNIV_INTERN
@@ -1684,3 +1939,4 @@ trx_sys_close(void)
trx_sys = NULL;
mutex_exit(&kernel_mutex);
}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/trx/trx0trx.c b/storage/xtradb/trx/trx0trx.c
index e81daf4cad9..f150d64f8dc 100644
--- a/storage/xtradb/trx/trx0trx.c
+++ b/storage/xtradb/trx/trx0trx.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -455,6 +455,7 @@ trx_lists_init_at_db_start(void)
trx_undo_t* undo;
trx_t* trx;
+ ut_ad(mutex_own(&kernel_mutex));
UT_LIST_INIT(trx_sys->trx_list);
/* Look from the rollback segments if there exist undo logs for
@@ -885,7 +886,7 @@ trx_commit_off_kernel(
recovery i.e.: back ground rollback thread is still active
then there is a chance that the rollback thread may see
this trx as COMMITTED_IN_MEMORY and goes adhead to clean it
- up calling trx_cleanup_at_db_startup(). This can happen
+ up calling trx_cleanup_at_db_startup(). This can happen
in the case we are committing a trx here that is left in
PREPARED state during the crash. Note that commit of the
rollback of a PREPARED trx happens in the recovery thread
diff --git a/storage/xtradb/ut/ut0rbt.c b/storage/xtradb/ut/ut0rbt.c
new file mode 100644
index 00000000000..3d7bc91e714
--- /dev/null
+++ b/storage/xtradb/ut/ut0rbt.c
@@ -0,0 +1,1249 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file ut/ut0rbt.c
+Red-Black tree implementation
+
+Created 2007-03-20 Sunny Bains
+***********************************************************************/
+
+#include "ut0rbt.h"
+
+/************************************************************************
+Definition of a red-black tree
+==============================
+
+A red-black tree is a binary search tree which has the following
+red-black properties:
+
+ 1. Every node is either red or black.
+ 2. Every leaf (NULL - in our case tree->nil) is black.
+ 3. If a node is red, then both its children are black.
+ 4. Every simple path from a node to a descendant leaf contains the
+ same number of black nodes.
+
+ from (3) above, the implication is that on any path from the root
+ to a leaf, red nodes must not be adjacent.
+
+ However, any number of black nodes may appear in a sequence. */
+
+#if defined(IB_RBT_TESTING)
+#warning "Testing enabled!"
+#endif
+
+#define ROOT(t) (t->root->left)
+#define SIZEOF_NODE(t) ((sizeof(ib_rbt_node_t) + t->sizeof_value) - 1)
+
+/****************************************************************//**
+Print out the sub-tree recursively. */
+static
+void
+rbt_print_subtree(
+/*==============*/
+ const ib_rbt_t* tree, /*!< in: tree to traverse */
+ const ib_rbt_node_t* node, /*!< in: node to print */
+ ib_rbt_print_node print) /*!< in: print key function */
+{
+ /* FIXME: Doesn't do anything yet */
+ if (node != tree->nil) {
+ print(node);
+ rbt_print_subtree(tree, node->left, print);
+ rbt_print_subtree(tree, node->right, print);
+ }
+}
+
+/****************************************************************//**
+Verify that the keys are in order.
+@return TRUE of OK. FALSE if not ordered */
+static
+ibool
+rbt_check_ordering(
+/*===============*/
+ const ib_rbt_t* tree) /*!< in: tree to verfify */
+{
+ const ib_rbt_node_t* node;
+ const ib_rbt_node_t* prev = NULL;
+
+ /* Iterate over all the nodes, comparing each node with the prev */
+ for (node = rbt_first(tree); node; node = rbt_next(tree, prev)) {
+
+ if (prev && tree->compare(prev->value, node->value) >= 0) {
+ return(FALSE);
+ }
+
+ prev = node;
+ }
+
+ return(TRUE);
+}
+
+/****************************************************************//**
+Check that every path from the root to the leaves has the same count.
+Count is expressed in the number of black nodes.
+@return 0 on failure else black height of the subtree */
+static
+ibool
+rbt_count_black_nodes(
+/*==================*/
+ const ib_rbt_t* tree, /*!< in: tree to verify */
+ const ib_rbt_node_t* node) /*!< in: start of sub-tree */
+{
+ ulint result;
+
+ if (node != tree->nil) {
+ ulint left_height = rbt_count_black_nodes(tree, node->left);
+
+ ulint right_height = rbt_count_black_nodes(tree, node->right);
+
+ if (left_height == 0
+ || right_height == 0
+ || left_height != right_height) {
+
+ result = 0;
+ } else if (node->color == IB_RBT_RED) {
+
+ /* Case 3 */
+ if (node->left->color != IB_RBT_BLACK
+ || node->right->color != IB_RBT_BLACK) {
+
+ result = 0;
+ } else {
+ result = left_height;
+ }
+ /* Check if it's anything other than RED or BLACK. */
+ } else if (node->color != IB_RBT_BLACK) {
+
+ result = 0;
+ } else {
+
+ result = right_height + 1;
+ }
+ } else {
+ result = 1;
+ }
+
+ return(result);
+}
+
+/****************************************************************//**
+Turn the node's right child's left sub-tree into node's right sub-tree.
+This will also make node's right child it's parent. */
+static
+void
+rbt_rotate_left(
+/*============*/
+ const ib_rbt_node_t* nil, /*!< in: nil node of the tree */
+ ib_rbt_node_t* node) /*!< in: node to rotate */
+{
+ ib_rbt_node_t* right = node->right;
+
+ node->right = right->left;
+
+ if (right->left != nil) {
+ right->left->parent = node;
+ }
+
+ /* Right's new parent was node's parent. */
+ right->parent = node->parent;
+
+ /* Since root's parent is tree->nil and root->parent->left points
+ back to root, we can avoid the check. */
+ if (node == node->parent->left) {
+ /* Node was on the left of its parent. */
+ node->parent->left = right;
+ } else {
+ /* Node must have been on the right. */
+ node->parent->right = right;
+ }
+
+ /* Finally, put node on right's left. */
+ right->left = node;
+ node->parent = right;
+}
+
+/****************************************************************//**
+Turn the node's left child's right sub-tree into node's left sub-tree.
+This also make node's left child it's parent. */
+static
+void
+rbt_rotate_right(
+/*=============*/
+ const ib_rbt_node_t* nil, /*!< in: nil node of tree */
+ ib_rbt_node_t* node) /*!< in: node to rotate */
+{
+ ib_rbt_node_t* left = node->left;
+
+ node->left = left->right;
+
+ if (left->right != nil) {
+ left->right->parent = node;
+ }
+
+ /* Left's new parent was node's parent. */
+ left->parent = node->parent;
+
+ /* Since root's parent is tree->nil and root->parent->left points
+ back to root, we can avoid the check. */
+ if (node == node->parent->right) {
+ /* Node was on the left of its parent. */
+ node->parent->right = left;
+ } else {
+ /* Node must have been on the left. */
+ node->parent->left = left;
+ }
+
+ /* Finally, put node on left's right. */
+ left->right = node;
+ node->parent = left;
+}
+
+/****************************************************************//**
+Append a node to the tree.
+@return inserted node */
+static
+ib_rbt_node_t*
+rbt_tree_add_child(
+/*===============*/
+ const ib_rbt_t* tree, /*!< in: rbt tree */
+ ib_rbt_bound_t* parent, /*!< in: node's parent */
+ ib_rbt_node_t* node) /*!< in: node to add */
+{
+ /* Cast away the const. */
+ ib_rbt_node_t* last = (ib_rbt_node_t*) parent->last;
+
+ if (last == tree->root || parent->result < 0) {
+ last->left = node;
+ } else {
+ /* FIXME: We don't handle duplicates (yet)! */
+ ut_a(parent->result != 0);
+
+ last->right = node;
+ }
+
+ node->parent = last;
+
+ return(node);
+}
+
+/****************************************************************//**
+Generic binary tree insert
+@return inserted node */
+static
+ib_rbt_node_t*
+rbt_tree_insert(
+/*============*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key, /*!< in: key for ordering */
+ ib_rbt_node_t* node) /*!< in: node hold the insert value */
+{
+ ib_rbt_bound_t parent;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ parent.result = 0;
+ parent.last = tree->root;
+
+ /* Regular binary search. */
+ while (current != tree->nil) {
+
+ parent.last = current;
+ parent.result = tree->compare(key, current->value);
+
+ if (parent.result < 0) {
+ current = current->left;
+ } else {
+ current = current->right;
+ }
+ }
+
+ ut_a(current == tree->nil);
+
+ rbt_tree_add_child(tree, &parent, node);
+
+ return(node);
+}
+
+/****************************************************************//**
+Balance a tree after inserting a node. */
+static
+void
+rbt_balance_tree(
+/*=============*/
+ const ib_rbt_t* tree, /*!< in: tree to balance */
+ ib_rbt_node_t* node) /*!< in: node that was inserted */
+{
+ const ib_rbt_node_t* nil = tree->nil;
+ ib_rbt_node_t* parent = node->parent;
+
+ /* Restore the red-black property. */
+ node->color = IB_RBT_RED;
+
+ while (node != ROOT(tree) && parent->color == IB_RBT_RED) {
+ ib_rbt_node_t* grand_parent = parent->parent;
+
+ if (parent == grand_parent->left) {
+ ib_rbt_node_t* uncle = grand_parent->right;
+
+ if (uncle->color == IB_RBT_RED) {
+
+ /* Case 1 - change the colors. */
+ uncle->color = IB_RBT_BLACK;
+ parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ /* Move node up the tree. */
+ node = grand_parent;
+
+ } else {
+
+ if (node == parent->right) {
+ /* Right is a black node and node is
+ to the right, case 2 - move node
+ up and rotate. */
+ node = parent;
+ rbt_rotate_left(nil, node);
+ }
+
+ grand_parent = node->parent->parent;
+
+ /* Case 3. */
+ node->parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ rbt_rotate_right(nil, grand_parent);
+ }
+
+ } else {
+ ib_rbt_node_t* uncle = grand_parent->left;
+
+ if (uncle->color == IB_RBT_RED) {
+
+ /* Case 1 - change the colors. */
+ uncle->color = IB_RBT_BLACK;
+ parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ /* Move node up the tree. */
+ node = grand_parent;
+
+ } else {
+
+ if (node == parent->left) {
+ /* Left is a black node and node is to
+ the right, case 2 - move node up and
+ rotate. */
+ node = parent;
+ rbt_rotate_right(nil, node);
+ }
+
+ grand_parent = node->parent->parent;
+
+ /* Case 3. */
+ node->parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ rbt_rotate_left(nil, grand_parent);
+ }
+ }
+
+ parent = node->parent;
+ }
+
+ /* Color the root black. */
+ ROOT(tree)->color = IB_RBT_BLACK;
+}
+
+/****************************************************************//**
+Find the given node's successor.
+@return successor node or NULL if no successor */
+static
+ib_rbt_node_t*
+rbt_find_successor(
+/*===============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current)/*!< in: this is declared const
+ because it can be called via
+ rbt_next() */
+{
+ const ib_rbt_node_t* nil = tree->nil;
+ ib_rbt_node_t* next = current->right;
+
+ /* Is there a sub-tree to the right that we can follow. */
+ if (next != nil) {
+
+ /* Follow the left most links of the current right child. */
+ while (next->left != nil) {
+ next = next->left;
+ }
+
+ } else { /* We will have to go up the tree to find the successor. */
+ ib_rbt_node_t* parent = current->parent;
+
+ /* Cast away the const. */
+ next = (ib_rbt_node_t*) current;
+
+ while (parent != tree->root && next == parent->right) {
+ next = parent;
+ parent = next->parent;
+ }
+
+ next = (parent == tree->root) ? NULL : parent;
+ }
+
+ return(next);
+}
+
+/****************************************************************//**
+Find the given node's precedecessor.
+@return predecessor node or NULL if no predecesor */
+static
+ib_rbt_node_t*
+rbt_find_predecessor(
+/*=================*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current) /*!< in: this is declared const
+ because it can be called via
+ rbt_prev() */
+{
+ const ib_rbt_node_t* nil = tree->nil;
+ ib_rbt_node_t* prev = current->left;
+
+ /* Is there a sub-tree to the left that we can follow. */
+ if (prev != nil) {
+
+ /* Follow the right most links of the current left child. */
+ while (prev->right != nil) {
+ prev = prev->right;
+ }
+
+ } else { /* We will have to go up the tree to find the precedecessor. */
+ ib_rbt_node_t* parent = current->parent;
+
+ /* Cast away the const. */
+ prev = (ib_rbt_node_t*)current;
+
+ while (parent != tree->root && prev == parent->left) {
+ prev = parent;
+ parent = prev->parent;
+ }
+
+ prev = (parent == tree->root) ? NULL : parent;
+ }
+
+ return(prev);
+}
+
+/****************************************************************//**
+Replace node with child. After applying transformations eject becomes
+an orphan. */
+static
+void
+rbt_eject_node(
+/*===========*/
+ ib_rbt_node_t* eject, /*!< in: node to eject */
+ ib_rbt_node_t* node) /*!< in: node to replace with */
+{
+ /* Update the to be ejected node's parent's child pointers. */
+ if (eject->parent->left == eject) {
+ eject->parent->left = node;
+ } else if (eject->parent->right == eject) {
+ eject->parent->right = node;
+ } else {
+ ut_a(0);
+ }
+ /* eject is now an orphan but otherwise its pointers
+ and color are left intact. */
+
+ node->parent = eject->parent;
+}
+
+/****************************************************************//**
+Replace a node with another node. */
+static
+void
+rbt_replace_node(
+/*=============*/
+ ib_rbt_node_t* replace, /*!< in: node to replace */
+ ib_rbt_node_t* node) /*!< in: node to replace with */
+{
+ ib_rbt_color_t color = node->color;
+
+ /* Update the node pointers. */
+ node->left = replace->left;
+ node->right = replace->right;
+
+ /* Update the child node pointers. */
+ node->left->parent = node;
+ node->right->parent = node;
+
+ /* Make the parent of replace point to node. */
+ rbt_eject_node(replace, node);
+
+ /* Swap the colors. */
+ node->color = replace->color;
+ replace->color = color;
+}
+
+/****************************************************************//**
+Detach node from the tree replacing it with one of it's children.
+@return the child node that now occupies the position of the detached node */
+static
+ib_rbt_node_t*
+rbt_detach_node(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_node_t* node) /*!< in: node to detach */
+{
+ ib_rbt_node_t* child;
+ const ib_rbt_node_t* nil = tree->nil;
+
+ if (node->left != nil && node->right != nil) {
+ /* Case where the node to be deleted has two children. */
+ ib_rbt_node_t* successor = rbt_find_successor(tree, node);
+
+ ut_a(successor != nil);
+ ut_a(successor->parent != nil);
+ ut_a(successor->left == nil);
+
+ child = successor->right;
+
+ /* Remove the successor node and replace with its child. */
+ rbt_eject_node(successor, child);
+
+ /* Replace the node to delete with its successor node. */
+ rbt_replace_node(node, successor);
+ } else {
+ ut_a(node->left == nil || node->right == nil);
+
+ child = (node->left != nil) ? node->left : node->right;
+
+ /* Replace the node to delete with one of it's children. */
+ rbt_eject_node(node, child);
+ }
+
+ /* Reset the node links. */
+ node->parent = node->right = node->left = tree->nil;
+
+ return(child);
+}
+
+/****************************************************************//**
+Rebalance the right sub-tree after deletion.
+@return node to rebalance if more rebalancing required else NULL */
+static
+ib_rbt_node_t*
+rbt_balance_right(
+/*==============*/
+ const ib_rbt_node_t* nil, /*!< in: rb tree nil node */
+ ib_rbt_node_t* parent, /*!< in: parent node */
+ ib_rbt_node_t* sibling)/*!< in: sibling node */
+{
+ ib_rbt_node_t* node = NULL;
+
+ ut_a(sibling != nil);
+
+ /* Case 3. */
+ if (sibling->color == IB_RBT_RED) {
+
+ parent->color = IB_RBT_RED;
+ sibling->color = IB_RBT_BLACK;
+
+ rbt_rotate_left(nil, parent);
+
+ sibling = parent->right;
+
+ ut_a(sibling != nil);
+ }
+
+ /* Since this will violate case 3 because of the change above. */
+ if (sibling->left->color == IB_RBT_BLACK
+ && sibling->right->color == IB_RBT_BLACK) {
+
+ node = parent; /* Parent needs to be rebalanced too. */
+ sibling->color = IB_RBT_RED;
+
+ } else {
+ if (sibling->right->color == IB_RBT_BLACK) {
+
+ ut_a(sibling->left->color == IB_RBT_RED);
+
+ sibling->color = IB_RBT_RED;
+ sibling->left->color = IB_RBT_BLACK;
+
+ rbt_rotate_right(nil, sibling);
+
+ sibling = parent->right;
+ ut_a(sibling != nil);
+ }
+
+ sibling->color = parent->color;
+ sibling->right->color = IB_RBT_BLACK;
+
+ parent->color = IB_RBT_BLACK;
+
+ rbt_rotate_left(nil, parent);
+ }
+
+ return(node);
+}
+
+/****************************************************************//**
+Rebalance the left sub-tree after deletion.
+@return node to rebalance if more rebalancing required else NULL */
+static
+ib_rbt_node_t*
+rbt_balance_left(
+/*=============*/
+ const ib_rbt_node_t* nil, /*!< in: rb tree nil node */
+ ib_rbt_node_t* parent, /*!< in: parent node */
+ ib_rbt_node_t* sibling)/*!< in: sibling node */
+{
+ ib_rbt_node_t* node = NULL;
+
+ ut_a(sibling != nil);
+
+ /* Case 3. */
+ if (sibling->color == IB_RBT_RED) {
+
+ parent->color = IB_RBT_RED;
+ sibling->color = IB_RBT_BLACK;
+
+ rbt_rotate_right(nil, parent);
+ sibling = parent->left;
+
+ ut_a(sibling != nil);
+ }
+
+ /* Since this will violate case 3 because of the change above. */
+ if (sibling->right->color == IB_RBT_BLACK
+ && sibling->left->color == IB_RBT_BLACK) {
+
+ node = parent; /* Parent needs to be rebalanced too. */
+ sibling->color = IB_RBT_RED;
+
+ } else {
+ if (sibling->left->color == IB_RBT_BLACK) {
+
+ ut_a(sibling->right->color == IB_RBT_RED);
+
+ sibling->color = IB_RBT_RED;
+ sibling->right->color = IB_RBT_BLACK;
+
+ rbt_rotate_left(nil, sibling);
+
+ sibling = parent->left;
+
+ ut_a(sibling != nil);
+ }
+
+ sibling->color = parent->color;
+ sibling->left->color = IB_RBT_BLACK;
+
+ parent->color = IB_RBT_BLACK;
+
+ rbt_rotate_right(nil, parent);
+ }
+
+ return(node);
+}
+
+/****************************************************************//**
+Delete the node and rebalance the tree if necessary */
+static
+void
+rbt_remove_node_and_rebalance(
+/*==========================*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_node_t* node) /*!< in: node to remove */
+{
+ /* Detach node and get the node that will be used
+ as rebalance start. */
+ ib_rbt_node_t* child = rbt_detach_node(tree, node);
+
+ if (node->color == IB_RBT_BLACK) {
+ ib_rbt_node_t* last = child;
+
+ ROOT(tree)->color = IB_RBT_RED;
+
+ while (child && child->color == IB_RBT_BLACK) {
+ ib_rbt_node_t* parent = child->parent;
+
+ /* Did the deletion cause an imbalance in the
+ parents left sub-tree. */
+ if (parent->left == child) {
+
+ child = rbt_balance_right(
+ tree->nil, parent, parent->right);
+
+ } else if (parent->right == child) {
+
+ child = rbt_balance_left(
+ tree->nil, parent, parent->left);
+
+ } else {
+ ut_error;
+ }
+
+ if (child) {
+ last = child;
+ }
+ }
+
+ ut_a(last);
+
+ last->color = IB_RBT_BLACK;
+ ROOT(tree)->color = IB_RBT_BLACK;
+ }
+
+ /* Note that we have removed a node from the tree. */
+ --tree->n_nodes;
+}
+
+/****************************************************************//**
+Recursively free the nodes. */
+static
+void
+rbt_free_node(
+/*==========*/
+ ib_rbt_node_t* node, /*!< in: node to free */
+ ib_rbt_node_t* nil) /*!< in: rb tree nil node */
+{
+ if (node != nil) {
+ rbt_free_node(node->left, nil);
+ rbt_free_node(node->right, nil);
+
+ ut_free(node);
+ }
+}
+
+/****************************************************************//**
+Free all the nodes and free the tree. */
+UNIV_INTERN
+void
+rbt_free(
+/*=====*/
+ ib_rbt_t* tree) /*!< in: rb tree to free */
+{
+ rbt_free_node(tree->root, tree->nil);
+ ut_free(tree->nil);
+ ut_free(tree);
+}
+
+/****************************************************************//**
+Create an instance of a red black tree.
+@return an empty rb tree */
+UNIV_INTERN
+ib_rbt_t*
+rbt_create(
+/*=======*/
+ size_t sizeof_value, /*!< in: sizeof data item */
+ ib_rbt_compare compare) /*!< in: fn to compare items */
+{
+ ib_rbt_t* tree;
+ ib_rbt_node_t* node;
+
+ tree = (ib_rbt_t*) ut_malloc(sizeof(*tree));
+ memset(tree, 0, sizeof(*tree));
+
+ tree->sizeof_value = sizeof_value;
+
+ /* Create the sentinel (NIL) node. */
+ node = tree->nil = (ib_rbt_node_t*) ut_malloc(sizeof(*node));
+ memset(node, 0, sizeof(*node));
+
+ node->color = IB_RBT_BLACK;
+ node->parent = node->left = node->right = node;
+
+ /* Create the "fake" root, the real root node will be the
+ left child of this node. */
+ node = tree->root = (ib_rbt_node_t*) ut_malloc(sizeof(*node));
+ memset(node, 0, sizeof(*node));
+
+ node->color = IB_RBT_BLACK;
+ node->parent = node->left = node->right = tree->nil;
+
+ tree->compare = compare;
+
+ return(tree);
+}
+
+/****************************************************************//**
+Generic insert of a value in the rb tree.
+@return inserted node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_insert(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key, /*!< in: key for ordering */
+ const void* value) /*!< in: value of key, this value
+ is copied to the node */
+{
+ ib_rbt_node_t* node;
+
+ /* Create the node that will hold the value data. */
+ node = (ib_rbt_node_t*) ut_malloc(SIZEOF_NODE(tree));
+
+ memcpy(node->value, value, tree->sizeof_value);
+ node->parent = node->left = node->right = tree->nil;
+
+ /* Insert in the tree in the usual way. */
+ rbt_tree_insert(tree, key, node);
+ rbt_balance_tree(tree, node);
+
+ ++tree->n_nodes;
+
+ return(node);
+}
+
+/****************************************************************//**
+Add a new node to the tree, useful for data that is pre-sorted.
+@return appended node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_add_node(
+/*=========*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: bounds */
+ const void* value) /*!< in: this value is copied
+ to the node */
+{
+ ib_rbt_node_t* node;
+
+ /* Create the node that will hold the value data */
+ node = (ib_rbt_node_t*) ut_malloc(SIZEOF_NODE(tree));
+
+ memcpy(node->value, value, tree->sizeof_value);
+ node->parent = node->left = node->right = tree->nil;
+
+ /* If tree is empty */
+ if (parent->last == NULL) {
+ parent->last = tree->root;
+ }
+
+ /* Append the node, the hope here is that the caller knows
+ what s/he is doing. */
+ rbt_tree_add_child(tree, parent, node);
+ rbt_balance_tree(tree, node);
+
+ ++tree->n_nodes;
+
+#if defined(IB_RBT_TESTING)
+ ut_a(rbt_validate(tree));
+#endif
+ return(node);
+}
+
+/****************************************************************//**
+Find a matching node in the rb tree.
+@return NULL if not found else the node where key was found */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lookup(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to use for search */
+{
+ const ib_rbt_node_t* current = ROOT(tree);
+
+ /* Regular binary search. */
+ while (current != tree->nil) {
+ int result = tree->compare(key, current->value);
+
+ if (result < 0) {
+ current = current->left;
+ } else if (result > 0) {
+ current = current->right;
+ } else {
+ break;
+ }
+ }
+
+ return(current != tree->nil ? current : NULL);
+}
+
+/****************************************************************//**
+Delete a node from the red black tree, identified by key.
+@return TRUE if success FALSE if not found */
+UNIV_INTERN
+ibool
+rbt_delete(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to delete */
+{
+ ibool deleted = FALSE;
+ ib_rbt_node_t* node = (ib_rbt_node_t*) rbt_lookup(tree, key);
+
+ if (node) {
+ rbt_remove_node_and_rebalance(tree, node);
+
+ ut_free(node);
+ deleted = TRUE;
+ }
+
+ return(deleted);
+}
+
+/****************************************************************//**
+Remove a node from the rb tree, the node is not free'd, that is the
+callers responsibility.
+@return deleted node but without the const */
+UNIV_INTERN
+ib_rbt_node_t*
+rbt_remove_node(
+/*============*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* const_node) /*!< in: node to delete, this
+ is a fudge and declared const
+ because the caller can access
+ only const nodes */
+{
+ /* Cast away the const. */
+ rbt_remove_node_and_rebalance(tree, (ib_rbt_node_t*) const_node);
+
+ /* This is to make it easier to do something like this:
+ ut_free(rbt_remove_node(node));
+ */
+
+ return((ib_rbt_node_t*) const_node);
+}
+
+/****************************************************************//**
+Find the node that has the lowest key that is >= key.
+@return node satisfying the lower bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lower_bound(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to search */
+{
+ ib_rbt_node_t* lb_node = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ int result = tree->compare(key, current->value);
+
+ if (result > 0) {
+
+ current = current->right;
+
+ } else if (result < 0) {
+
+ lb_node = current;
+ current = current->left;
+
+ } else {
+ lb_node = current;
+ break;
+ }
+ }
+
+ return(lb_node);
+}
+
+/****************************************************************//**
+Find the node that has the greatest key that is <= key.
+@return node satisfying the upper bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_upper_bound(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to search */
+{
+ ib_rbt_node_t* ub_node = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ int result = tree->compare(key, current->value);
+
+ if (result > 0) {
+
+ ub_node = current;
+ current = current->right;
+
+ } else if (result < 0) {
+
+ current = current->left;
+
+ } else {
+ ub_node = current;
+ break;
+ }
+ }
+
+ return(ub_node);
+}
+
+/****************************************************************//**
+Find the node that has the greatest key that is <= key.
+@return value of result */
+UNIV_INTERN
+int
+rbt_search(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key) /*!< in: key to search */
+{
+ ib_rbt_node_t* current = ROOT(tree);
+
+ /* Every thing is greater than the NULL root. */
+ parent->result = 1;
+ parent->last = NULL;
+
+ while (current != tree->nil) {
+
+ parent->last = current;
+ parent->result = tree->compare(key, current->value);
+
+ if (parent->result > 0) {
+ current = current->right;
+ } else if (parent->result < 0) {
+ current = current->left;
+ } else {
+ break;
+ }
+ }
+
+ return(parent->result);
+}
+
+/****************************************************************//**
+Find the node that has the greatest key that is <= key. But use the
+supplied comparison function.
+@return value of result */
+UNIV_INTERN
+int
+rbt_search_cmp(
+/*===========*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key, /*!< in: key to search */
+ ib_rbt_compare compare) /*!< in: fn to compare items */
+{
+ ib_rbt_node_t* current = ROOT(tree);
+
+ /* Every thing is greater than the NULL root. */
+ parent->result = 1;
+ parent->last = NULL;
+
+ while (current != tree->nil) {
+
+ parent->last = current;
+ parent->result = compare(key, current->value);
+
+ if (parent->result > 0) {
+ current = current->right;
+ } else if (parent->result < 0) {
+ current = current->left;
+ } else {
+ break;
+ }
+ }
+
+ return(parent->result);
+}
+
+/****************************************************************//**
+Get the leftmost node.
+Return the left most node in the tree. */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_first(
+/*======*/
+ const ib_rbt_t* tree) /* in: rb tree */
+{
+ ib_rbt_node_t* first = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ first = current;
+ current = current->left;
+ }
+
+ return(first);
+}
+
+/****************************************************************//**
+Return the right most node in the tree.
+@return the rightmost node or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_last(
+/*=====*/
+ const ib_rbt_t* tree) /*!< in: rb tree */
+{
+ ib_rbt_node_t* last = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ last = current;
+ current = current->right;
+ }
+
+ return(last);
+}
+
+/****************************************************************//**
+Return the next node.
+@return node next from current */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_next(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current)/*!< in: current node */
+{
+ return(current ? rbt_find_successor(tree, current) : NULL);
+}
+
+/****************************************************************//**
+Return the previous node.
+@return node prev from current */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_prev(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current)/*!< in: current node */
+{
+ return(current ? rbt_find_predecessor(tree, current) : NULL);
+}
+
+/****************************************************************//**
+Reset the tree. Delete all the nodes. */
+UNIV_INTERN
+void
+rbt_clear(
+/*======*/
+ ib_rbt_t* tree) /*!< in: rb tree */
+{
+ rbt_free_node(ROOT(tree), tree->nil);
+
+ tree->n_nodes = 0;
+ tree->root->left = tree->root->right = tree->nil;
+}
+
+/****************************************************************//**
+Merge the node from dst into src. Return the number of nodes merged.
+@return no. of recs merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq(
+/*===========*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ const ib_rbt_t* src) /*!< in: src rb tree */
+{
+ ib_rbt_bound_t parent;
+ ulint n_merged = 0;
+ const ib_rbt_node_t* src_node = rbt_first(src);
+
+ if (rbt_empty(src) || dst == src) {
+ return(0);
+ }
+
+ for (/* No op */; src_node; src_node = rbt_next(src, src_node)) {
+
+ if (rbt_search(dst, &parent, src_node->value) != 0) {
+ rbt_add_node(dst, &parent, src_node->value);
+ ++n_merged;
+ }
+ }
+
+ return(n_merged);
+}
+
+/****************************************************************//**
+Merge the node from dst into src. Return the number of nodes merged.
+Delete the nodes from src after copying node to dst. As a side effect
+the duplicates will be left untouched in the src.
+@return no. of recs merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq_destructive(
+/*=======================*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ ib_rbt_t* src) /*!< in: src rb tree */
+{
+ ib_rbt_bound_t parent;
+ ib_rbt_node_t* src_node;
+ ulint old_size = rbt_size(dst);
+
+ if (rbt_empty(src) || dst == src) {
+ return(0);
+ }
+
+ for (src_node = (ib_rbt_node_t*) rbt_first(src); src_node; /* */) {
+ ib_rbt_node_t* prev = src_node;
+
+ src_node = (ib_rbt_node_t*)rbt_next(src, prev);
+
+ /* Skip duplicates. */
+ if (rbt_search(dst, &parent, prev->value) != 0) {
+
+ /* Remove and reset the node but preserve
+ the node (data) value. */
+ rbt_remove_node_and_rebalance(src, prev);
+
+ /* The nil should be taken from the dst tree. */
+ prev->parent = prev->left = prev->right = dst->nil;
+ rbt_tree_add_child(dst, &parent, prev);
+ rbt_balance_tree(dst, prev);
+
+ ++dst->n_nodes;
+ }
+ }
+
+#if defined(IB_RBT_TESTING)
+ ut_a(rbt_validate(dst));
+ ut_a(rbt_validate(src));
+#endif
+ return(rbt_size(dst) - old_size);
+}
+
+/****************************************************************//**
+Check that every path from the root to the leaves has the same count and
+the tree nodes are in order.
+@return TRUE if OK FALSE otherwise */
+UNIV_INTERN
+ibool
+rbt_validate(
+/*=========*/
+ const ib_rbt_t* tree) /*!< in: RB tree to validate */
+{
+ if (rbt_count_black_nodes(tree, ROOT(tree)) > 0) {
+ return(rbt_check_ordering(tree));
+ }
+
+ return(FALSE);
+}
+
+/****************************************************************//**
+Iterate over the tree in depth first order. */
+UNIV_INTERN
+void
+rbt_print(
+/*======*/
+ const ib_rbt_t* tree, /*!< in: tree to traverse */
+ ib_rbt_print_node print) /*!< in: print function */
+{
+ rbt_print_subtree(tree, ROOT(tree), print);
+}