summaryrefslogtreecommitdiff
path: root/storage/innobase
diff options
context:
space:
mode:
authorVasil Dimov <vasil.dimov@oracle.com>2010-04-12 18:20:41 +0300
committerVasil Dimov <vasil.dimov@oracle.com>2010-04-12 18:20:41 +0300
commitc877ff39bceb4df96acf3e54f7c98a2bed12b8ee (patch)
tree04211a3e5734b73e9f94cff511a4a74ff87075f0 /storage/innobase
parentfe0828b3b8193e086abe740572c9b0cb2b7da671 (diff)
parent410e23a6af8b597cdda0890d6ed9008355edee7a (diff)
downloadmariadb-git-c877ff39bceb4df96acf3e54f7c98a2bed12b8ee.tar.gz
Import branches/innodb+ from SVN on top of storage/innobase.
Diffstat (limited to 'storage/innobase')
-rw-r--r--storage/innobase/CMakeLists.txt16
-rw-r--r--storage/innobase/ChangeLog209
-rw-r--r--storage/innobase/Makefile.am4
-rw-r--r--storage/innobase/btr/btr0btr.c25
-rw-r--r--storage/innobase/btr/btr0cur.c387
-rw-r--r--storage/innobase/btr/btr0pcur.c27
-rw-r--r--storage/innobase/btr/btr0sea.c16
-rw-r--r--storage/innobase/buf/buf0buddy.c6
-rw-r--r--storage/innobase/buf/buf0buf.c554
-rw-r--r--storage/innobase/buf/buf0flu.c684
-rw-r--r--storage/innobase/buf/buf0lru.c129
-rw-r--r--storage/innobase/buf/buf0rea.c8
-rw-r--r--storage/innobase/dict/dict0boot.c10
-rw-r--r--storage/innobase/dict/dict0crea.c121
-rw-r--r--storage/innobase/dict/dict0dict.c44
-rw-r--r--storage/innobase/dict/dict0load.c117
-rw-r--r--storage/innobase/dict/dict0mem.c12
-rw-r--r--storage/innobase/fil/fil0fil.c116
-rw-r--r--storage/innobase/fsp/fsp0fsp.c38
-rw-r--r--storage/innobase/ha/ha0ha.c16
-rw-r--r--storage/innobase/ha/hash0hash.c14
-rw-r--r--storage/innobase/handler/ha_innodb.cc1381
-rw-r--r--storage/innobase/handler/ha_innodb.h35
-rw-r--r--storage/innobase/handler/handler0alter.cc102
-rw-r--r--storage/innobase/handler/i_s.cc5
-rw-r--r--storage/innobase/handler/mysql_addons.cc2
-rw-r--r--storage/innobase/ibuf/ibuf0ibuf.c1525
-rw-r--r--storage/innobase/include/btr0btr.h29
-rw-r--r--storage/innobase/include/btr0btr.ic6
-rw-r--r--storage/innobase/include/btr0cur.h76
-rw-r--r--storage/innobase/include/btr0pcur.h52
-rw-r--r--storage/innobase/include/btr0pcur.ic49
-rw-r--r--storage/innobase/include/buf0buf.h181
-rw-r--r--storage/innobase/include/buf0buf.ic60
-rw-r--r--storage/innobase/include/buf0flu.h28
-rw-r--r--storage/innobase/include/buf0flu.ic37
-rw-r--r--storage/innobase/include/data0type.ic6
-rw-r--r--storage/innobase/include/db0err.h7
-rw-r--r--storage/innobase/include/dict0boot.h3
-rw-r--r--storage/innobase/include/dict0dict.h5
-rw-r--r--storage/innobase/include/dict0mem.h26
-rw-r--r--storage/innobase/include/fil0fil.h17
-rw-r--r--storage/innobase/include/ha_prototypes.h2
-rw-r--r--storage/innobase/include/handler0alter.h2
-rw-r--r--storage/innobase/include/hash0hash.h5
-rw-r--r--storage/innobase/include/hash0hash.ic20
-rw-r--r--storage/innobase/include/ibuf0ibuf.h44
-rw-r--r--storage/innobase/include/ibuf0ibuf.ic11
-rw-r--r--storage/innobase/include/lock0lock.h13
-rw-r--r--storage/innobase/include/log0log.h31
-rw-r--r--storage/innobase/include/log0log.ic11
-rw-r--r--storage/innobase/include/log0recv.h10
-rw-r--r--storage/innobase/include/mem0dbg.h9
-rw-r--r--storage/innobase/include/mem0dbg.ic5
-rw-r--r--storage/innobase/include/mem0mem.h5
-rw-r--r--storage/innobase/include/mem0mem.ic10
-rw-r--r--storage/innobase/include/mtr0mtr.ic5
-rw-r--r--storage/innobase/include/os0file.h505
-rw-r--r--storage/innobase/include/os0file.ic408
-rw-r--r--storage/innobase/include/os0thread.h7
-rw-r--r--storage/innobase/include/pars0pars.h19
-rw-r--r--storage/innobase/include/que0que.h13
-rw-r--r--storage/innobase/include/que0que.ic16
-rw-r--r--storage/innobase/include/row0merge.h2
-rw-r--r--storage/innobase/include/row0mysql.h23
-rw-r--r--storage/innobase/include/row0purge.h22
-rw-r--r--storage/innobase/include/row0row.h20
-rw-r--r--storage/innobase/include/row0sel.h13
-rw-r--r--storage/innobase/include/row0types.h2
-rw-r--r--storage/innobase/include/srv0srv.h130
-rw-r--r--storage/innobase/include/sync0rw.h360
-rw-r--r--storage/innobase/include/sync0rw.ic262
-rw-r--r--storage/innobase/include/sync0sync.h271
-rw-r--r--storage/innobase/include/sync0sync.ic150
-rw-r--r--storage/innobase/include/trx0purge.h6
-rw-r--r--storage/innobase/include/trx0rseg.h21
-rw-r--r--storage/innobase/include/trx0sys.h29
-rw-r--r--storage/innobase/include/trx0trx.h69
-rw-r--r--storage/innobase/include/trx0types.h9
-rw-r--r--storage/innobase/include/trx0undo.ic2
-rw-r--r--storage/innobase/include/univ.i31
-rw-r--r--storage/innobase/include/ut0lst.h4
-rw-r--r--storage/innobase/include/ut0rbt.h293
-rw-r--r--storage/innobase/include/ut0rnd.ic1
-rw-r--r--storage/innobase/include/ut0ut.h14
-rw-r--r--storage/innobase/lock/lock0lock.c237
-rw-r--r--storage/innobase/log/log0log.c50
-rw-r--r--storage/innobase/log/log0recv.c96
-rw-r--r--storage/innobase/mem/mem0dbg.c16
-rw-r--r--storage/innobase/mem/mem0mem.c20
-rw-r--r--storage/innobase/mem/mem0pool.c7
-rw-r--r--storage/innobase/mtr/mtr0mtr.c122
-rw-r--r--storage/innobase/mysql-test/innodb-autoinc-44030.result30
-rw-r--r--storage/innobase/mysql-test/innodb-autoinc-44030.test34
-rw-r--r--storage/innobase/mysql-test/innodb-autoinc.result379
-rw-r--r--storage/innobase/mysql-test/innodb-autoinc.test190
-rw-r--r--storage/innobase/mysql-test/innodb-consistent-master.opt2
-rw-r--r--storage/innobase/mysql-test/innodb-consistent.test116
-rw-r--r--storage/innobase/mysql-test/innodb-index.result11
-rw-r--r--storage/innobase/mysql-test/innodb-index.test28
-rw-r--r--storage/innobase/mysql-test/innodb-master.opt2
-rw-r--r--storage/innobase/mysql-test/innodb-semi-consistent-master.opt2
-rw-r--r--storage/innobase/mysql-test/innodb-use-sys-malloc-master.opt3
-rw-r--r--storage/innobase/mysql-test/innodb-zip.result98
-rw-r--r--storage/innobase/mysql-test/innodb-zip.test46
-rw-r--r--storage/innobase/mysql-test/innodb.result12
-rw-r--r--storage/innobase/mysql-test/innodb.test19
-rw-r--r--storage/innobase/mysql-test/innodb_bug21704.result12
-rw-r--r--storage/innobase/mysql-test/innodb_bug38231.result11
-rw-r--r--storage/innobase/mysql-test/innodb_bug38231.test112
-rw-r--r--storage/innobase/mysql-test/innodb_bug39438-master.opt1
-rw-r--r--storage/innobase/mysql-test/innodb_bug39438.result1
-rw-r--r--storage/innobase/mysql-test/innodb_bug39438.test51
-rw-r--r--storage/innobase/mysql-test/innodb_bug42101-nonzero-master.opt2
-rw-r--r--storage/innobase/mysql-test/innodb_bug44369.result14
-rw-r--r--storage/innobase/mysql-test/innodb_bug44369.test10
-rw-r--r--storage/innobase/mysql-test/innodb_bug44571.result7
-rw-r--r--storage/innobase/mysql-test/innodb_bug44571.test17
-rw-r--r--storage/innobase/mysql-test/innodb_bug46000.result18
-rw-r--r--storage/innobase/mysql-test/innodb_bug46000.test12
-rw-r--r--storage/innobase/mysql-test/innodb_bug47621.result21
-rw-r--r--storage/innobase/mysql-test/innodb_bug47621.test57
-rw-r--r--storage/innobase/mysql-test/innodb_bug47622.result23
-rw-r--r--storage/innobase/mysql-test/innodb_bug47622.test55
-rw-r--r--storage/innobase/mysql-test/innodb_bug47777.result13
-rw-r--r--storage/innobase/mysql-test/innodb_bug47777.test24
-rw-r--r--storage/innobase/mysql-test/innodb_bug51378.result66
-rw-r--r--storage/innobase/mysql-test/innodb_bug51378.test77
-rw-r--r--storage/innobase/mysql-test/innodb_bug51920.result13
-rw-r--r--storage/innobase/mysql-test/innodb_bug51920.test39
-rw-r--r--storage/innobase/mysql-test/innodb_file_format.result2
-rw-r--r--storage/innobase/mysql-test/innodb_information_schema.test16
-rw-r--r--storage/innobase/mysql-test/patches/innodb-index.diff62
-rw-r--r--storage/innobase/mysql-test/patches/innodb_change_buffering_basic.diff60
-rw-r--r--storage/innobase/os/os0file.c907
-rw-r--r--storage/innobase/os/os0thread.c7
-rw-r--r--storage/innobase/page/page0page.c50
-rw-r--r--storage/innobase/pars/pars0pars.c25
-rw-r--r--storage/innobase/plug.in12
-rw-r--r--storage/innobase/rem/rem0rec.c18
-rw-r--r--storage/innobase/row/row0ins.c23
-rw-r--r--storage/innobase/row/row0merge.c203
-rw-r--r--storage/innobase/row/row0mysql.c281
-rw-r--r--storage/innobase/row/row0purge.c201
-rw-r--r--storage/innobase/row/row0row.c48
-rw-r--r--storage/innobase/row/row0sel.c75
-rw-r--r--storage/innobase/row/row0uins.c50
-rw-r--r--storage/innobase/row/row0umod.c112
-rw-r--r--storage/innobase/row/row0upd.c115
-rw-r--r--storage/innobase/srv/srv0srv.c519
-rw-r--r--storage/innobase/srv/srv0start.c196
-rw-r--r--storage/innobase/sync/sync0arr.c8
-rw-r--r--storage/innobase/sync/sync0rw.c21
-rw-r--r--storage/innobase/sync/sync0sync.c114
-rw-r--r--storage/innobase/thr/thr0loc.c8
-rw-r--r--storage/innobase/trx/trx0i_s.c26
-rw-r--r--storage/innobase/trx/trx0purge.c33
-rw-r--r--storage/innobase/trx/trx0rec.c3
-rw-r--r--storage/innobase/trx/trx0roll.c4
-rw-r--r--storage/innobase/trx/trx0rseg.c146
-rw-r--r--storage/innobase/trx/trx0sys.c101
-rw-r--r--storage/innobase/trx/trx0trx.c10
-rw-r--r--storage/innobase/ut/ut0rbt.c1231
-rw-r--r--storage/innobase/ut/ut0ut.c104
-rw-r--r--storage/innobase/ut/ut0wqueue.c4
165 files changed, 13060 insertions, 2981 deletions
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt
index e4f7da5e5f3..b63b45d52b4 100644
--- a/storage/innobase/CMakeLists.txt
+++ b/storage/innobase/CMakeLists.txt
@@ -22,13 +22,21 @@ INCLUDE(CheckCSourceRuns)
# OS tests
IF(UNIX)
IF(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ CHECK_INCLUDE_FILES (libaio.h HAVE_LIBAIO_H)
+ CHECK_LIBRARY_EXISTS(aio io_queue_init "" HAVE_LIBAIO)
ADD_DEFINITIONS("-DUNIV_LINUX -D_GNU_SOURCE=1")
+ IF(HAVE_LIBAIO_H AND HAVE_LIBAIO)
+ ADD_DEFINITIONS(-DLINUX_NATIVE_AIO=1)
+ LINK_LIBRARIES(aio)
+ ENDIF()
ELSEIF(CMAKE_SYSTEM_NAME MATCHES "HP*")
ADD_DEFINITIONS("-DUNIV_HPUX -DUNIV_MUST_NOT_INLINE")
ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "AIX")
ADD_DEFINITIONS("-DUNIV_AIX -DUNIX_MUST_NOT_INLINE")
ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
ADD_DEFINITIONS("-DUNIV_SOLARIS")
+ ELSE()
+ ADD_DEFINITIONS("-DUNIV_MUST_NOT_INLINE")
ENDIF()
ENDIF()
@@ -230,8 +238,12 @@ SET(INNOBASE_SOURCES btr/btr0btr.c btr/btr0cur.c btr/btr0pcur.c btr/btr0sea.c
trx/trx0i_s.c trx/trx0purge.c trx/trx0rec.c trx/trx0roll.c trx/trx0rseg.c
trx/trx0sys.c trx/trx0trx.c trx/trx0undo.c
usr/usr0sess.c
- ut/ut0byte.c ut/ut0dbg.c ut/ut0mem.c ut/ut0rnd.c ut/ut0ut.c ut/ut0vec.c
- ut/ut0list.c ut/ut0wqueue.c)
+ ut/ut0byte.c ut/ut0dbg.c ut/ut0list.c ut/ut0mem.c ut/ut0rbt.c ut/ut0rnd.c
+ ut/ut0ut.c ut/ut0vec.c ut/ut0wqueue.c)
+# Windows atomics do not perform well. Disable Windows atomics by default.
+# See bug#52102 for details.
+#ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS -DINNODB_RW_LOCKS_USE_ATOMICS -DHAVE_IB_PAUSE_INSTRUCTION)
+ADD_DEFINITIONS(-DHAVE_IB_PAUSE_INSTRUCTION)
IF(WITH_INNODB)
# Legacy option
diff --git a/storage/innobase/ChangeLog b/storage/innobase/ChangeLog
index 1a6e07fd147..58b56f1e8a5 100644
--- a/storage/innobase/ChangeLog
+++ b/storage/innobase/ChangeLog
@@ -1,3 +1,200 @@
+2010-03-31 The InnoDB Team
+
+ * mysql-test/innodb_bug51920.test, mysql-test/innodb_bug51920.result,
+ srv/srv0srv.c:
+ Fix Bug#51920 InnoDB connections in row lock wait ignore KILL
+ until lock wait timeout
+
+2010-03-31 The InnoDB Team
+
+ * mysql-test/innodb_bug38231.test:
+ Remove non-determinism in the test case.
+
+2010-03-18 The InnoDB Team
+
+ * CMakeLists.txt:
+ Fix Bug#52102 InnoDB Plugin shows performance drop compared to
+ InnoDB (Windows)
+
+2010-03-18 The InnoDB Team
+
+ * buf0buf.ic:
+ When comparing the time of the first access to a block against
+ innodb_old_blocks_time, use 32-bit arithmetics. The comparison was
+ incorrect on 64-bit systems.
+
+2010-03-11 The InnoDB Team
+
+ * buf0buf.h, buf0buf.ic:
+ Fix and clarify the latching of some buf_block_t members.
+ Note that check_index_page_at_flush is not protected by any mutex.
+ Note and assert that lock_hash_val is protected by the rw-latch.
+
+2010-03-10 The InnoDB Team
+
+ * trx/trx0sys.c:
+ Fix Bug#51653 outdated reference to set-variable
+
+2010-03-10 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb_bug21704.result,
+ mysql-test/innodb_bug47621.result, mysql-test/innodb_bug47621.test:
+ Fix Bug#47621 MySQL and InnoDB data dictionaries will become out of
+ sync when renaming columns
+
+2010-03-10 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Fix Bug#51356 Many Valgrind errors in error messages
+ with concurrent DDL
+
+2010-03-10 The InnoDB Team
+
+ * handler/ha_innodb.cc, handler/handler0alter.cc,
+ mysql-test/innodb_bug51378.result, mysql-test/innodb_bug51378.test:
+ Fix Bug#51378 Init 'ref_length' to correct value, in case an out
+ of bound MySQL primary_key
+
+2010-03-10 The InnoDB Team
+
+ * log/log0recv.c:
+ Remove a bogus assertion about page numbers exceeding 0x90000000
+ in the redo log. Abort when encountering a corrupted redo log
+ record, unless innodb_force_recovery is set.
+
+2010-03-09 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Make SHOW ENGINE INNODB MUTEX STATUS display SUM(os_waits)
+ for the buffer pool block mutexes and locks.
+
+2010-03-08 The InnoDB Team
+
+ * fil/fil0fil.c:
+ Fix ALTER TABLE ... IMPORT TABLESPACE of compressed tables.
+
+2010-03-03 The InnoDB Team
+
+ * handler/handler0alter.cc, innodb-index.result, innodb-index.test,
+ innodb.result, innodb.test:
+ Disallow a duplicate index name when creating an index.
+
+2010-02-11 The InnoDB Team
+
+ * include/mem0mem.h, include/mem0mem.ic, mem/mem0mem.c:
+ Fix Bug#49535 Available memory check slows down crash
+ recovery tens of times
+
+2010-02-09 The InnoDB Team
+
+ * buf/buf0buf.c:
+ Fix Bug#38901 InnoDB logs error repeatedly when trying to load
+ page into buffer pool
+
+2010-02-09 The InnoDB Team
+
+ * srv/srv0srv.c:
+ Let the master thread sleep if the amount of work to be done is
+ calibrated as taking less than a second.
+
+2010-02-04 The InnoDB Team
+
+ * btr/btr0btr.c, btr/btr0cur.c, btr/btr0pcur.c, buf/buf0buf.c,
+ include/btr0btr.h, include/btr0cur.h, include/btr0pcur.h,
+ include/btr0pcur.ic, include/buf0buf.h, row/row0ins.c, row/row0sel.c:
+ Pass the file name and line number of the caller of the
+ b-tree cursor functions to the buffer pool requests, in order
+ to make the latch diagnostics more accurate.
+
+2010-02-03 The InnoDB Team
+
+ * lock/lock0lock.c:
+ Fix Bug#49001 SHOW INNODB STATUS deadlock info incorrect
+ when deadlock detection aborts
+
+2010-02-03 The InnoDB Team
+
+ * buf/buf0lru.c:
+ Fix Bug#35077 Very slow DROP TABLE (ALTER TABLE, OPTIMIZE TABLE)
+ on compressed tables
+
+2010-02-03 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/row0mysql.h, row/row0mysql.c:
+ Clean up CHECK TABLE error handling.
+
+2010-02-01 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb-autoinc.test,
+ mysql-test/innodb-autoinc.result,
+ mysql-test/innodb-autoinc-44030.test,
+ mysql-test/innodb-autoinc-44030.result:
+ Fix Bug#49497 Error 1467 (ER_AUTOINC_READ_FAILED) on inserting
+ a negative value
+
+2010-01-27 The InnoDB Team
+
+ * include/row0mysql.h, log/log0recv.c, row/row0mysql.c:
+ Drop temporary tables at startup.
+ This addresses the third aspect of
+ Bug#41609 Crash recovery does not work for InnoDB temporary tables.
+
+2010-01-21 The InnoDB Team
+
+ * buf/buf0buf.c:
+ Do not merge buffered inserts to compressed pages before
+ the redo log has been applied in crash recovery.
+
+2010-01-13 The InnoDB Team
+
+ * row/row0sel.c:
+ On the READ UNCOMMITTED isolation level, do not attempt to access
+ a clustered index record that has been marked for deletion. The
+ built-in InnoDB in MySQL 5.1 and earlier would attempt to retrieve
+ a previous version of the record in this case.
+
+2010-01-13 The InnoDB Team
+
+ * buf/buf0buf.c:
+ When disabling the adaptive hash index, check the block state
+ before checking block->is_hashed, because the latter may be
+ uninitialized right after server startup.
+
+2010-01-12 The InnoDB Team
+
+ * handler/ha_innodb.cc, handler/ha_innodb.h:
+ Fix Bug#46193 crash when accessing tables after enabling
+ innodb_force_recovery option
+
+2010-01-12 The InnoDB Team
+
+ * row/row0mysql.c:
+ Fix Bug#49238 Creating/Dropping a temporary table while at 1023
+ transactions will cause assert.
+
+2009-12-02 The InnoDB Team
+
+ * srv/srv0start.c:
+ Display the zlib version number at startup.
+ InnoDB compressed tables use zlib, and the implementation depends
+ on the zlib function compressBound(), whose definition was slightly
+ changed in zlib version 1.2.3.1 in 2006. MySQL bundles zlib 1.2.3
+ from 2005, but some installations use a more recent zlib.
+
+2009-11-30 The InnoDB Team
+
+ * dict/dict0crea.c, dict/dict0mem.c, dict/dict0load.c,
+ dict/dict0boot.c, fil/fil0fil.c, handler/ha_innodb.cc,
+ include/dict0mem.h, row/row0mysql.c:
+ Fix the bogus warning messages for non-existing temporary
+ tables that were reported in
+ Bug#41609 Crash recovery does not work for InnoDB temporary tables.
+ The actual crash recovery bug was corrected on 2009-04-29.
+
+2009-11-27 The InnoDB Team
+
+ InnoDB Plugin 1.0.6 released
+
2009-11-20 The InnoDB Team
* handler/ha_innodb.cc:
@@ -79,8 +276,8 @@
sync/sync0arr.c, sync/sync0sync.c, thr/thr0loc.c, trx/trx0i_s.c,
trx/trx0purge.c, trx/trx0rseg.c, trx/trx0sys.c, trx/trx0undo.c,
usr/usr0sess.c, ut/ut0mem.c:
- Fix Bug #45992 innodb memory not freed after shutdown
- Fix Bug #46656 InnoDB plugin: memory leaks (Valgrind)
+ Fix Bug#45992 innodb memory not freed after shutdown
+ Fix Bug#46656 InnoDB plugin: memory leaks (Valgrind)
2009-10-29 The InnoDB Team
@@ -422,7 +619,7 @@
* dict/dict0dict.c:
When an index column cannot be found in the table during index
creation, display additional diagnostic before an assertion failure.
- This does NOT fix Bug #44571 InnoDB Plugin crashes on ADD INDEX,
+ This does NOT fix Bug#44571 InnoDB Plugin crashes on ADD INDEX,
but it helps understand the reason of the crash.
2009-06-17 The InnoDB Team
@@ -535,6 +732,12 @@
Fix Bug#44320 InnoDB: missing DB_ROLL_PTR in Table Monitor COLUMNS
output
+2009-04-29 The InnoDB Team
+
+ * fil/fil0fil.c, include/fil0fil.h, include/mtr0mtr.h,
+ log/log0recv.c:
+ Fix Bug#41609 Crash recovery does not work for InnoDB temporary tables
+
2009-04-23 The InnoDB Team
* row/row0mysql.c:
diff --git a/storage/innobase/Makefile.am b/storage/innobase/Makefile.am
index 6051fe9ef96..e64a92519e1 100644
--- a/storage/innobase/Makefile.am
+++ b/storage/innobase/Makefile.am
@@ -28,7 +28,6 @@ INCLUDES= -I$(top_srcdir)/include -I$(top_builddir)/include \
DEFS= @DEFS@
-
noinst_HEADERS= \
handler/ha_innodb.h \
handler/i_s.h \
@@ -118,6 +117,7 @@ noinst_HEADERS= \
include/mtr0types.h \
include/mysql_addons.h \
include/os0file.h \
+ include/os0file.ic \
include/os0proc.h \
include/os0proc.ic \
include/os0sync.h \
@@ -217,6 +217,7 @@ noinst_HEADERS= \
include/ut0lst.h \
include/ut0mem.h \
include/ut0mem.ic \
+ include/ut0rbt.h \
include/ut0rnd.h \
include/ut0rnd.ic \
include/ut0sort.h \
@@ -318,6 +319,7 @@ libinnobase_a_SOURCES= \
ut/ut0dbg.c \
ut/ut0list.c \
ut/ut0mem.c \
+ ut/ut0rbt.c \
ut/ut0rnd.c \
ut/ut0ut.c \
ut/ut0vec.c \
diff --git a/storage/innobase/btr/btr0btr.c b/storage/innobase/btr/btr0btr.c
index 086b3a0a599..8589d415131 100644
--- a/storage/innobase/btr/btr0btr.c
+++ b/storage/innobase/btr/btr0btr.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -592,15 +592,18 @@ an x-latch on the tree.
@return rec_get_offsets() of the node pointer record */
static
ulint*
-btr_page_get_father_node_ptr(
-/*=========================*/
+btr_page_get_father_node_ptr_func(
+/*==============================*/
ulint* offsets,/*!< in: work area for the return value */
mem_heap_t* heap, /*!< in: memory heap to use */
btr_cur_t* cursor, /*!< in: cursor pointing to user record,
out: cursor on node pointer record,
its page x-latched */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
+ page_t* page;
dtuple_t* tuple;
rec_t* user_rec;
rec_t* node_ptr;
@@ -617,12 +620,15 @@ btr_page_get_father_node_ptr(
ut_ad(dict_index_get_page(index) != page_no);
level = btr_page_get_level(btr_cur_get_page(cursor), mtr);
+
+ page = btr_cur_get_page(cursor);
user_rec = btr_cur_get_rec(cursor);
ut_a(page_rec_is_user_rec(user_rec));
tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level);
btr_cur_search_to_nth_level(index, level + 1, tuple, PAGE_CUR_LE,
- BTR_CONT_MODIFY_TREE, cursor, 0, mtr);
+ BTR_CONT_MODIFY_TREE, cursor, 0,
+ file, line, mtr);
node_ptr = btr_cur_get_rec(cursor);
ut_ad(!page_rec_is_comp(node_ptr)
@@ -670,6 +676,9 @@ btr_page_get_father_node_ptr(
return(offsets);
}
+#define btr_page_get_father_node_ptr(of,heap,cur,mtr) \
+ btr_page_get_father_node_ptr_func(of,heap,cur,__FILE__,__LINE__,mtr)
+
/************************************************************//**
Returns the upper level node pointer to a page. It is assumed that mtr holds
an x-latch on the tree.
@@ -1662,11 +1671,13 @@ Inserts a data tuple to a tree on a non-leaf level. It is assumed
that mtr holds an x-latch on the tree. */
UNIV_INTERN
void
-btr_insert_on_non_leaf_level(
-/*=========================*/
+btr_insert_on_non_leaf_level_func(
+/*==============================*/
dict_index_t* index, /*!< in: index */
ulint level, /*!< in: level, must be > 0 */
dtuple_t* tuple, /*!< in: the record to be inserted */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
big_rec_t* dummy_big_rec;
@@ -1678,7 +1689,7 @@ btr_insert_on_non_leaf_level(
btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_LE,
BTR_CONT_MODIFY_TREE,
- &cursor, 0, mtr);
+ &cursor, 0, file, line, mtr);
err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG
| BTR_KEEP_SYS_FLAG
diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.c
index 46dfb5d1a46..3ca2b02bb4b 100644
--- a/storage/innobase/btr/btr0cur.c
+++ b/storage/innobase/btr/btr0cur.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -57,6 +57,8 @@ Created 10/16/1994 Heikki Tuuri
#include "buf0lru.h"
#include "btr0btr.h"
#include "btr0sea.h"
+#include "row0purge.h"
+#include "row0upd.h"
#include "trx0rec.h"
#include "trx0roll.h" /* trx_is_recv() */
#include "que0que.h"
@@ -66,6 +68,15 @@ Created 10/16/1994 Heikki Tuuri
#include "lock0lock.h"
#include "zlib.h"
+/** Buffered B-tree operation types, introduced as part of delete buffering. */
+typedef enum btr_op_enum {
+ BTR_NO_OP = 0, /*!< Not buffered */
+ BTR_INSERT_OP, /*!< Insert, do not ignore UNIQUE */
+ BTR_INSERT_IGNORE_UNIQUE_OP, /*!< Insert, ignoring UNIQUE */
+ BTR_DELETE_OP, /*!< Purge a delete-marked record */
+ BTR_DELMARK_OP /*!< Mark a record for deletion */
+} btr_op_t;
+
#ifdef UNIV_DEBUG
/** If the following is set to TRUE, this module prints a lot of
trace information of individual record operations */
@@ -328,7 +339,8 @@ btr_cur_search_to_nth_level(
Inserts should always be made using
PAGE_CUR_LE to search the position! */
ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
- BTR_INSERT and BTR_ESTIMATE;
+ at most one of BTR_INSERT, BTR_DELETE_MARK,
+ BTR_DELETE, or BTR_ESTIMATE;
cursor->left_block is used to store a pointer
to the left neighbor page, in the cases
BTR_SEARCH_PREV and BTR_MODIFY_PREV;
@@ -342,25 +354,30 @@ btr_cur_search_to_nth_level(
ulint has_search_latch,/*!< in: info on the latch mode the
caller currently has on btr_search_latch:
RW_S_LATCH, or 0 */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
- page_cur_t* page_cursor;
page_t* page;
+ buf_block_t* block;
+ ulint space;
buf_block_t* guess;
- rec_t* node_ptr;
+ ulint height;
ulint page_no;
- ulint space;
ulint up_match;
ulint up_bytes;
ulint low_match;
ulint low_bytes;
- ulint height;
ulint savepoint;
+ ulint rw_latch;
ulint page_mode;
- ulint insert_planned;
+ ulint buf_mode;
ulint estimate;
- ulint ignore_sec_unique;
+ ulint zip_size;
+ page_cur_t* page_cursor;
+ btr_op_t btr_op;
ulint root_height = 0; /* remove warning */
+
#ifdef BTR_CUR_ADAPT
btr_search_t* info;
#endif
@@ -380,17 +397,53 @@ btr_cur_search_to_nth_level(
cursor->up_match = ULINT_UNDEFINED;
cursor->low_match = ULINT_UNDEFINED;
#endif
- insert_planned = latch_mode & BTR_INSERT;
+
+ /* These flags are mutually exclusive, they are lumped together
+ with the latch mode for historical reasons. It's possible for
+ none of the flags to be set. */
+ switch (UNIV_EXPECT(latch_mode
+ & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
+ 0)) {
+ case 0:
+ btr_op = BTR_NO_OP;
+ break;
+ case BTR_INSERT:
+ btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
+ ? BTR_INSERT_IGNORE_UNIQUE_OP
+ : BTR_INSERT_OP;
+ break;
+ case BTR_DELETE:
+ btr_op = BTR_DELETE_OP;
+ ut_a(cursor->purge_node);
+ break;
+ case BTR_DELETE_MARK:
+ btr_op = BTR_DELMARK_OP;
+ break;
+ default:
+ /* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
+ should be specified at a time */
+ ut_error;
+ }
+
+ /* Operations on the insert buffer tree cannot be buffered. */
+ ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
+ /* Operations on the clustered index cannot be buffered. */
+ ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));
+
estimate = latch_mode & BTR_ESTIMATE;
- ignore_sec_unique = latch_mode & BTR_IGNORE_SEC_UNIQUE;
- latch_mode = latch_mode & ~(BTR_INSERT | BTR_ESTIMATE
- | BTR_IGNORE_SEC_UNIQUE);
- ut_ad(!insert_planned || (mode == PAGE_CUR_LE));
+ /* Turn the flags unrelated to the latch mode off. */
+ latch_mode &= ~(BTR_INSERT
+ | BTR_DELETE_MARK
+ | BTR_DELETE
+ | BTR_ESTIMATE
+ | BTR_IGNORE_SEC_UNIQUE);
cursor->flag = BTR_CUR_BINARY;
cursor->index = index;
+ cursor->ibuf_cnt = ULINT_UNDEFINED;
+
#ifndef BTR_CUR_ADAPT
guess = NULL;
#else
@@ -404,7 +457,8 @@ btr_cur_search_to_nth_level(
info->n_searches++;
#endif
if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_NOT_LOCKED
- && latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ
+ && latch_mode <= BTR_MODIFY_LEAF
+ && info->last_hash_succ
&& !estimate
#ifdef PAGE_CUR_LE_OR_EXTENDS
&& mode != PAGE_CUR_LE_OR_EXTENDS
@@ -493,156 +547,219 @@ btr_cur_search_to_nth_level(
/* Loop and search until we arrive at the desired level */
- for (;;) {
- ulint zip_size;
- buf_block_t* block;
- ulint rw_latch;
- ulint buf_mode;
-
- zip_size = dict_table_zip_size(index->table);
- rw_latch = RW_NO_LATCH;
- buf_mode = BUF_GET;
+search_loop:
+ buf_mode = BUF_GET;
+ rw_latch = RW_NO_LATCH;
- if (height == 0 && latch_mode <= BTR_MODIFY_LEAF) {
+ if (height != 0) {
+ /* We are about to fetch the root or a non-leaf page. */
+ } else if (latch_mode <= BTR_MODIFY_LEAF) {
+ rw_latch = latch_mode;
- rw_latch = latch_mode;
+ if (btr_op != BTR_NO_OP
+ && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {
- if (insert_planned
- && ibuf_should_try(index, ignore_sec_unique)) {
+ /* Try to buffer the operation if the leaf
+ page is not in the buffer pool. */
- /* Try insert to the insert buffer if the
- page is not in the buffer pool */
-
- buf_mode = BUF_GET_IF_IN_POOL;
- }
+ buf_mode = btr_op == BTR_DELETE_OP
+ ? BUF_GET_IF_IN_POOL_OR_WATCH
+ : BUF_GET_IF_IN_POOL;
}
+ }
+
+ zip_size = dict_table_zip_size(index->table);
retry_page_get:
- block = buf_page_get_gen(space, zip_size, page_no,
- rw_latch, guess, buf_mode,
- __FILE__, __LINE__, mtr);
- if (block == NULL) {
- /* This must be a search to perform an insert;
- try insert to the insert buffer */
+ block = buf_page_get_gen(
+ space, zip_size, page_no, rw_latch, guess, buf_mode,
+ file, line, mtr);
+
+ if (block == NULL) {
+ /* This must be a search to perform an insert/delete
+ mark/ delete; try using the insert/delete buffer */
+
+ ut_ad(height == 0);
+ ut_ad(cursor->thr);
+ switch (btr_op) {
+ case BTR_INSERT_OP:
+ case BTR_INSERT_IGNORE_UNIQUE_OP:
ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
- ut_ad(insert_planned);
- ut_ad(cursor->thr);
- if (ibuf_insert(tuple, index, space, zip_size,
- page_no, cursor->thr)) {
- /* Insertion to the insert buffer succeeded */
+ if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
+ space, zip_size, page_no,
+ cursor->thr)) {
+
cursor->flag = BTR_CUR_INSERT_TO_IBUF;
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
- }
+
goto func_exit;
}
+ break;
- /* Insert to the insert buffer did not succeed:
- retry page get */
+ case BTR_DELMARK_OP:
+ ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
- buf_mode = BUF_GET;
+ if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
+ index, space, zip_size,
+ page_no, cursor->thr)) {
- goto retry_page_get;
+ cursor->flag = BTR_CUR_DEL_MARK_IBUF;
+
+ goto func_exit;
+ }
+
+ break;
+
+ case BTR_DELETE_OP:
+ ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
+
+ if (!row_purge_poss_sec(cursor->purge_node,
+ index, tuple)) {
+
+ /* The record cannot be purged yet. */
+ cursor->flag = BTR_CUR_DELETE_REF;
+ } else if (ibuf_insert(IBUF_OP_DELETE, tuple,
+ index, space, zip_size,
+ page_no,
+ cursor->thr)) {
+
+ /* The purge was buffered. */
+ cursor->flag = BTR_CUR_DELETE_IBUF;
+ } else {
+ /* The purge could not be buffered. */
+ buf_pool_watch_unset(space, page_no);
+ break;
+ }
+
+ buf_pool_watch_unset(space, page_no);
+ goto func_exit;
+
+ default:
+ ut_error;
}
- page = buf_block_get_frame(block);
+ /* Insert to the insert/delete buffer did not succeed, we
+ must read the page from disk. */
- block->check_index_page_at_flush = TRUE;
+ buf_mode = BUF_GET;
+
+ goto retry_page_get;
+ }
+
+ block->check_index_page_at_flush = TRUE;
+ page = buf_block_get_frame(block);
- if (rw_latch != RW_NO_LATCH) {
+ if (rw_latch != RW_NO_LATCH) {
#ifdef UNIV_ZIP_DEBUG
- const page_zip_des_t* page_zip
- = buf_block_get_page_zip(block);
- ut_a(!page_zip || page_zip_validate(page_zip, page));
+ const page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
- buf_block_dbg_add_level(block, SYNC_TREE_NODE);
- }
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+ }
- ut_ad(0 == ut_dulint_cmp(index->id,
- btr_page_get_index_id(page)));
+ ut_ad(0 == ut_dulint_cmp(index->id, btr_page_get_index_id(page)));
- if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
- /* We are in the root node */
+ if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
+ /* We are in the root node */
+
+ height = btr_page_get_level(page, mtr);
+ root_height = height;
+ cursor->tree_height = root_height + 1;
- height = btr_page_get_level(page, mtr);
- root_height = height;
- cursor->tree_height = root_height + 1;
#ifdef BTR_CUR_ADAPT
- if (block != guess) {
- info->root_guess = block;
- }
-#endif
+ if (block != guess) {
+ info->root_guess = block;
}
+#endif
+ }
- if (height == 0) {
- if (rw_latch == RW_NO_LATCH) {
-
- btr_cur_latch_leaves(page, space, zip_size,
- page_no, latch_mode,
- cursor, mtr);
- }
+ if (height == 0) {
+ if (rw_latch == RW_NO_LATCH) {
- if ((latch_mode != BTR_MODIFY_TREE)
- && (latch_mode != BTR_CONT_MODIFY_TREE)) {
+ btr_cur_latch_leaves(
+ page, space, zip_size, page_no, latch_mode,
+ cursor, mtr);
+ }
- /* Release the tree s-latch */
+ if (latch_mode != BTR_MODIFY_TREE
+ && latch_mode != BTR_CONT_MODIFY_TREE) {
- mtr_release_s_latch_at_savepoint(
- mtr, savepoint,
- dict_index_get_lock(index));
- }
+ /* Release the tree s-latch */
- page_mode = mode;
+ mtr_release_s_latch_at_savepoint(
+ mtr, savepoint, dict_index_get_lock(index));
}
- page_cur_search_with_match(block, index, tuple, page_mode,
- &up_match, &up_bytes,
- &low_match, &low_bytes,
- page_cursor);
-
- if (estimate) {
- btr_cur_add_path_info(cursor, height, root_height);
- }
+ page_mode = mode;
+ }
- /* If this is the desired level, leave the loop */
+ page_cur_search_with_match(
+ block, index, tuple, page_mode, &up_match, &up_bytes,
+ &low_match, &low_bytes, page_cursor);
- ut_ad(height == btr_page_get_level(
- page_cur_get_page(page_cursor), mtr));
+ if (estimate) {
+ btr_cur_add_path_info(cursor, height, root_height);
+ }
- if (level == height) {
+ /* If this is the desired level, leave the loop */
- if (level > 0) {
- /* x-latch the page */
- page = btr_page_get(space, zip_size,
- page_no, RW_X_LATCH, mtr);
- ut_a((ibool)!!page_is_comp(page)
- == dict_table_is_comp(index->table));
- }
+ ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor),
+ mtr));
- break;
- }
+ if (level != height) {
+ const rec_t* node_ptr;
ut_ad(height > 0);
height--;
-
guess = NULL;
node_ptr = page_cur_get_rec(page_cursor);
- offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
- ULINT_UNDEFINED, &heap);
+
+ offsets = rec_get_offsets(
+ node_ptr, index, offsets, ULINT_UNDEFINED, &heap);
+
/* Go to the child node */
page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
- }
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
+ if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
+ /* We're doing a search on an ibuf tree and we're one
+ level above the leaf page. */
+
+ ulint is_min_rec;
+
+ ut_ad(level == 0);
+
+ is_min_rec = rec_get_info_bits(node_ptr, 0)
+ & REC_INFO_MIN_REC_FLAG;
+
+ if (!is_min_rec) {
+ cursor->ibuf_cnt
+ = ibuf_rec_get_counter(node_ptr);
+
+ ut_a(cursor->ibuf_cnt <= 0xFFFF
+ || cursor->ibuf_cnt == ULINT_UNDEFINED);
+ }
+
+ buf_mode = BUF_GET;
+ rw_latch = RW_NO_LATCH;
+ goto retry_page_get;
+ }
+
+ goto search_loop;
}
- if (level == 0) {
+ if (level != 0) {
+ /* x-latch the page */
+ page = btr_page_get(
+ space, zip_size, page_no, RW_X_LATCH, mtr);
+
+ ut_a((ibool)!!page_is_comp(page)
+ == dict_table_is_comp(index->table));
+ } else {
cursor->low_match = low_match;
cursor->low_bytes = low_bytes;
cursor->up_match = up_match;
@@ -667,6 +784,11 @@ retry_page_get:
}
func_exit:
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
if (has_search_latch) {
rw_lock_s_lock(&btr_search_latch);
@@ -677,13 +799,15 @@ func_exit:
Opens a cursor at either end of an index. */
UNIV_INTERN
void
-btr_cur_open_at_index_side(
-/*=======================*/
+btr_cur_open_at_index_side_func(
+/*============================*/
ibool from_left, /*!< in: TRUE if open to the low end,
FALSE if to the high end */
dict_index_t* index, /*!< in: index */
ulint latch_mode, /*!< in: latch mode */
btr_cur_t* cursor, /*!< in: cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
page_cur_t* page_cursor;
@@ -728,7 +852,7 @@ btr_cur_open_at_index_side(
page_t* page;
block = buf_page_get_gen(space, zip_size, page_no,
RW_NO_LATCH, NULL, BUF_GET,
- __FILE__, __LINE__, mtr);
+ file, line, mtr);
page = buf_block_get_frame(block);
ut_ad(0 == ut_dulint_cmp(index->id,
btr_page_get_index_id(page)));
@@ -808,11 +932,13 @@ btr_cur_open_at_index_side(
Positions a cursor at a randomly chosen position within a B-tree. */
UNIV_INTERN
void
-btr_cur_open_at_rnd_pos(
-/*====================*/
+btr_cur_open_at_rnd_pos_func(
+/*=========================*/
dict_index_t* index, /*!< in: index */
ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_cur_t* cursor, /*!< in/out: B-tree cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
page_cur_t* page_cursor;
@@ -847,7 +973,7 @@ btr_cur_open_at_rnd_pos(
block = buf_page_get_gen(space, zip_size, page_no,
RW_NO_LATCH, NULL, BUF_GET,
- __FILE__, __LINE__, mtr);
+ file, line, mtr);
page = buf_block_get_frame(block);
ut_ad(0 == ut_dulint_cmp(index->id,
btr_page_get_index_id(page)));
@@ -2729,25 +2855,26 @@ btr_cur_del_mark_set_sec_rec(
}
/***********************************************************//**
-Clear a secondary index record's delete mark. This function is only
-used by the insert buffer insert merge mechanism. */
+Sets a secondary index record's delete mark to the given value. This
+function is only used by the insert buffer merge mechanism. */
UNIV_INTERN
void
-btr_cur_del_unmark_for_ibuf(
-/*========================*/
- rec_t* rec, /*!< in/out: record to delete unmark */
+btr_cur_set_deleted_flag_for_ibuf(
+/*==============================*/
+ rec_t* rec, /*!< in/out: record */
page_zip_des_t* page_zip, /*!< in/out: compressed page
corresponding to rec, or NULL
when the tablespace is
uncompressed */
+ ibool val, /*!< in: value to set */
mtr_t* mtr) /*!< in: mtr */
{
/* We do not need to reserve btr_search_latch, as the page has just
been read to the buffer pool and there cannot be a hash index to it. */
- btr_rec_set_deleted_flag(rec, page_zip, FALSE);
+ btr_rec_set_deleted_flag(rec, page_zip, val);
- btr_cur_del_mark_set_sec_rec_log(rec, FALSE, mtr);
+ btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
}
/*==================== B-TREE RECORD REMOVE =========================*/
@@ -3100,7 +3227,8 @@ btr_estimate_n_rows_in_range(
btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
BTR_SEARCH_LEAF | BTR_ESTIMATE,
- &cursor, 0, &mtr);
+ &cursor, 0,
+ __FILE__, __LINE__, &mtr);
} else {
btr_cur_open_at_index_side(TRUE, index,
BTR_SEARCH_LEAF | BTR_ESTIMATE,
@@ -3117,7 +3245,8 @@ btr_estimate_n_rows_in_range(
btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
BTR_SEARCH_LEAF | BTR_ESTIMATE,
- &cursor, 0, &mtr);
+ &cursor, 0,
+ __FILE__, __LINE__, &mtr);
} else {
btr_cur_open_at_index_side(FALSE, index,
BTR_SEARCH_LEAF | BTR_ESTIMATE,
@@ -4252,7 +4381,7 @@ btr_free_externally_stored_field(
/* In the rollback of uncommitted transactions, we may
encounter a clustered index record whose BLOBs have
not been written. There is nothing to free then. */
- ut_a(rb_ctx == RB_RECOVERY);
+ ut_a(rb_ctx == RB_RECOVERY || rb_ctx == RB_RECOVERY_PURGE_REC);
return;
}
@@ -4298,7 +4427,7 @@ btr_free_externally_stored_field(
|| (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
& BTR_EXTERN_OWNER_FLAG)
/* Rollback and inherited field */
- || (rb_ctx != RB_NONE
+ || ((rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY)
&& (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
& BTR_EXTERN_INHERITED_FLAG))) {
diff --git a/storage/innobase/btr/btr0pcur.c b/storage/innobase/btr/btr0pcur.c
index ec98692c35b..658901208ef 100644
--- a/storage/innobase/btr/btr0pcur.c
+++ b/storage/innobase/btr/btr0pcur.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -205,10 +205,12 @@ record and it can be restored on a user record whose ordering fields
are identical to the ones of the original user record */
UNIV_INTERN
ibool
-btr_pcur_restore_position(
-/*======================*/
+btr_pcur_restore_position_func(
+/*===========================*/
ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_pcur_t* cursor, /*!< in: detached persistent cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
dict_index_t* index;
@@ -217,6 +219,9 @@ btr_pcur_restore_position(
ulint old_mode;
mem_heap_t* heap;
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor));
if (UNIV_UNLIKELY(cursor->old_stored != BTR_PCUR_OLD_STORED)
@@ -257,7 +262,8 @@ btr_pcur_restore_position(
if (UNIV_LIKELY(buf_page_optimistic_get(
latch_mode,
cursor->block_when_stored,
- cursor->modify_clock, mtr))) {
+ cursor->modify_clock,
+ file, line, mtr))) {
cursor->pos_state = BTR_PCUR_IS_POSITIONED;
buf_block_dbg_add_level(btr_pcur_get_block(cursor),
@@ -312,8 +318,8 @@ btr_pcur_restore_position(
mode = PAGE_CUR_L;
}
- btr_pcur_open_with_no_init(index, tuple, mode, latch_mode,
- cursor, 0, mtr);
+ btr_pcur_open_with_no_init_func(index, tuple, mode, latch_mode,
+ cursor, 0, file, line, mtr);
/* Restore the old search mode */
cursor->search_mode = old_mode;
@@ -553,8 +559,8 @@ before first in tree. The latching mode must be BTR_SEARCH_LEAF or
BTR_MODIFY_LEAF. */
UNIV_INTERN
void
-btr_pcur_open_on_user_rec(
-/*======================*/
+btr_pcur_open_on_user_rec_func(
+/*===========================*/
dict_index_t* index, /*!< in: index */
const dtuple_t* tuple, /*!< in: tuple on which search done */
ulint mode, /*!< in: PAGE_CUR_L, ... */
@@ -562,9 +568,12 @@ btr_pcur_open_on_user_rec(
BTR_MODIFY_LEAF */
btr_pcur_t* cursor, /*!< in: memory buffer for persistent
cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
- btr_pcur_open(index, tuple, mode, latch_mode, cursor, mtr);
+ btr_pcur_open_func(index, tuple, mode, latch_mode, cursor,
+ file, line, mtr);
if ((mode == PAGE_CUR_GE) || (mode == PAGE_CUR_G)) {
diff --git a/storage/innobase/btr/btr0sea.c b/storage/innobase/btr/btr0sea.c
index ef7afeb1039..7f8a9af1dd8 100644
--- a/storage/innobase/btr/btr0sea.c
+++ b/storage/innobase/btr/btr0sea.c
@@ -50,6 +50,11 @@ UNIV_INTERN char btr_search_enabled = TRUE;
/** Mutex protecting btr_search_enabled */
static mutex_t btr_search_enabled_mutex;
+#ifdef UNIV_PFS_MUTEX
+/* Key to register btr_search_enabled_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t btr_search_enabled_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
/** A dummy variable to fool the compiler */
UNIV_INTERN ulint btr_search_this_is_zero = 0;
@@ -82,6 +87,11 @@ UNIV_INTERN byte btr_sea_pad2[64];
/** The adaptive hash index */
UNIV_INTERN btr_search_sys_t* btr_search_sys;
+#ifdef UNIV_PFS_RWLOCK
+/* Key to register btr_search_sys with performance schema */
+UNIV_INTERN mysql_pfs_key_t btr_search_latch_key;
+#endif /* UNIV_PFS_RWLOCK */
+
/** If the number of records on the page divided by this parameter
would have been successfully accessed using a hash index, the index
is then built on the page, assuming the global limit has been reached */
@@ -167,8 +177,10 @@ btr_search_sys_create(
btr_search_latch_temp = mem_alloc(sizeof(rw_lock_t));
- rw_lock_create(&btr_search_latch, SYNC_SEARCH_SYS);
- mutex_create(&btr_search_enabled_mutex, SYNC_SEARCH_SYS_CONF);
+ rw_lock_create(btr_search_latch_key, &btr_search_latch,
+ SYNC_SEARCH_SYS);
+ mutex_create(btr_search_enabled_mutex_key,
+ &btr_search_enabled_mutex, SYNC_SEARCH_SYS_CONF);
btr_search_sys = mem_alloc(sizeof(btr_search_sys_t));
diff --git a/storage/innobase/buf/buf0buddy.c b/storage/innobase/buf/buf0buddy.c
index f0e1395c307..7118cb376ab 100644
--- a/storage/innobase/buf/buf0buddy.c
+++ b/storage/innobase/buf/buf0buddy.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -391,6 +391,8 @@ buf_buddy_relocate_block(
UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, dpage);
}
+ UNIV_MEM_INVALID(bpage, sizeof *bpage);
+
mutex_exit(&buf_pool_zip_mutex);
return(TRUE);
}
@@ -455,6 +457,8 @@ buf_buddy_relocate(
return(FALSE);
}
+ ut_ad(!buf_pool_watch_is(bpage));
+
if (page_zip_get_size(&bpage->zip) != size) {
/* The block is of different size. We would
have to relocate all blocks covered by src.
diff --git a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c
index 111d396fbc5..c4b693e3ed2 100644
--- a/storage/innobase/buf/buf0buf.c
+++ b/storage/innobase/buf/buf0buf.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -153,12 +153,12 @@ list. We also keep a pointer to near the end of the LRU list,
which we can use when we want to artificially age a page in the
buf_pool. This is used if we know that some page is not needed
again for some time: we insert the block right after the pointer,
-causing it to be replaced sooner than would noramlly be the case.
+causing it to be replaced sooner than would normally be the case.
Currently this aging mechanism is used for read-ahead mechanism
of pages, and it can also be used when there is a scan of a full
table which cannot fit in the memory. Putting the pages near the
-of the LRU list, we make sure that most of the buf_pool stays in the
-main memory, undisturbed.
+end of the LRU list, we make sure that most of the buf_pool stays
+in the main memory, undisturbed.
The unzip_LRU list contains a subset of the common LRU list. The
blocks on the unzip_LRU list hold a compressed file page and the
@@ -172,6 +172,7 @@ The chain of modified blocks (buf_pool->flush_list) contains the blocks
holding file pages that have been modified in the memory
but not written to disk yet. The block with the oldest modification
which has not yet been written to disk is at the end of the chain.
+The access to this list is protected by flush_list_mutex.
The chain of unmodified compressed blocks (buf_pool->zip_clean)
contains the control blocks (buf_page_t) of those compressed pages
@@ -242,6 +243,8 @@ the read requests for the whole area.
#ifndef UNIV_HOTBACKUP
/** Value in microseconds */
static const int WAIT_FOR_READ = 5000;
+/** Number of attemtps made to read in a page in the buffer pool */
+static const ulint BUF_PAGE_READ_MAX_RETRIES = 100;
/** The buffer buf_pool of the database */
UNIV_INTERN buf_pool_t* buf_pool = NULL;
@@ -267,6 +270,41 @@ read-ahead or flush occurs */
UNIV_INTERN ibool buf_debug_prints = FALSE;
#endif /* UNIV_DEBUG */
+#ifdef UNIV_PFS_RWLOCK
+/* Keys to register buffer block related rwlocks and mutexes with
+performance schema */
+UNIV_INTERN mysql_pfs_key_t buf_block_lock_key;
+# ifdef UNIV_SYNC_DEBUG
+UNIV_INTERN mysql_pfs_key_t buf_block_debug_latch_key;
+# endif /* UNIV_SYNC_DEBUG */
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t buffer_block_mutex_key;
+UNIV_INTERN mysql_pfs_key_t buf_pool_mutex_key;
+UNIV_INTERN mysql_pfs_key_t buf_pool_zip_mutex_key;
+UNIV_INTERN mysql_pfs_key_t flush_list_mutex_key;
+UNIV_INTERN mysql_pfs_key_t flush_order_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK
+# ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK
+
+/* Buffer block mutexes and rwlocks can be registered
+in one group rather than individually. If PFS_GROUP_BUFFER_SYNC
+is defined, register buffer block mutex and rwlock
+in one group after their initialization. */
+# define PFS_GROUP_BUFFER_SYNC
+
+/* This define caps the number of mutexes/rwlocks can
+be registered with performance schema. Developers can
+modify this define if necessary. Please note, this would
+be effective only if PFS_GROUP_BUFFER_SYNC is defined. */
+# define PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER ULINT_MAX
+
+# endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
+#endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */
+
/** A chunk of buffers. The buffer pool is allocated in chunks. */
struct buf_chunk_struct{
ulint mem_size; /*!< allocated size of the chunk */
@@ -636,6 +674,53 @@ buf_page_print(
}
#ifndef UNIV_HOTBACKUP
+
+# ifdef PFS_GROUP_BUFFER_SYNC
+/********************************************************************//**
+This function registers mutexes and rwlocks in buffer blocks with
+performance schema. If PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER is
+defined to be a value less than chunk->size, then only mutexes
+and rwlocks in the first PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER
+blocks are registered. */
+static
+void
+pfs_register_buffer_block(
+/*======================*/
+ buf_chunk_t* chunk) /*!< in/out: chunk of buffers */
+{
+ ulint i;
+ ulint num_to_register;
+ buf_block_t* block;
+
+ block = chunk->blocks;
+
+ num_to_register = ut_min(chunk->size,
+ PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER);
+
+ for (i = 0; i < num_to_register; i++) {
+ mutex_t* mutex;
+ rw_lock_t* rwlock;
+
+# ifdef UNIV_PFS_MUTEX
+ mutex = &block->mutex;
+ ut_a(!mutex->pfs_psi);
+ mutex->pfs_psi = (PSI_server)
+ ? PSI_server->init_mutex(buffer_block_mutex_key, mutex)
+ : NULL;
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+ rwlock = &block->lock;
+ ut_a(!rwlock->pfs_psi);
+ rwlock->pfs_psi = (PSI_server)
+ ? PSI_server->init_rwlock(buf_block_lock_key, rwlock)
+ : NULL;
+# endif /* UNIV_PFS_RWLOCK */
+ block++;
+ }
+}
+# endif /* PFS_GROUP_BUFFER_SYNC */
+
/********************************************************************//**
Initializes a buffer control block when the buf_pool is created. */
static
@@ -675,13 +760,25 @@ buf_block_init(
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
page_zip_des_init(&block->page.zip);
- mutex_create(&block->mutex, SYNC_BUF_BLOCK);
+#if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC
+ /* If PFS_SKIP_BUFFER_MUTEX_RWLOCK is defined, skip registration
+ of buffer block mutex/rwlock with performance schema. If
+ PFS_GROUP_BUFFER_SYNC is defined, skip the registration
+ since buffer block mutex/rwlock will be registered later in
+ pfs_register_buffer_block() */
+
+ mutex_create(PFS_NOT_INSTRUMENTED, &block->mutex, SYNC_BUF_BLOCK);
+ rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING);
+#else /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
+ mutex_create(buffer_block_mutex_key, &block->mutex, SYNC_BUF_BLOCK);
+ rw_lock_create(buf_block_lock_key, &block->lock, SYNC_LEVEL_VARYING);
+#endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
- rw_lock_create(&block->lock, SYNC_LEVEL_VARYING);
ut_ad(rw_lock_validate(&(block->lock)));
#ifdef UNIV_SYNC_DEBUG
- rw_lock_create(&block->debug_latch, SYNC_NO_ORDER_CHECK);
+ rw_lock_create(buf_block_debug_latch_key,
+ &block->debug_latch, SYNC_NO_ORDER_CHECK);
#endif /* UNIV_SYNC_DEBUG */
}
@@ -761,6 +858,9 @@ buf_chunk_init(
frame += UNIV_PAGE_SIZE;
}
+#ifdef PFS_GROUP_BUFFER_SYNC
+ pfs_register_buffer_block(chunk);
+#endif
return(chunk);
}
@@ -952,8 +1052,10 @@ buf_pool_init(void)
/* 1. Initialize general fields
------------------------------- */
- mutex_create(&buf_pool_mutex, SYNC_BUF_POOL);
- mutex_create(&buf_pool_zip_mutex, SYNC_BUF_BLOCK);
+ mutex_create(buf_pool_mutex_key,
+ &buf_pool_mutex, SYNC_BUF_POOL);
+ mutex_create(buf_pool_zip_mutex_key,
+ &buf_pool_zip_mutex, SYNC_BUF_BLOCK);
buf_pool_mutex_enter();
@@ -981,6 +1083,11 @@ buf_pool_init(void)
/* 2. Initialize flushing fields
-------------------------------- */
+ mutex_create(flush_list_mutex_key, &buf_pool->flush_list_mutex,
+ SYNC_BUF_FLUSH_LIST);
+ mutex_create(flush_order_mutex_key, &buf_pool->flush_order_mutex,
+ SYNC_BUF_FLUSH_ORDER);
+
for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
buf_pool->no_flush[i] = os_event_create(NULL);
}
@@ -1058,7 +1165,9 @@ buf_pool_drop_hash_index(void)
when we have an x-latch on btr_search_latch;
see the comment in buf0buf.h */
- if (!block->is_hashed) {
+ if (buf_block_get_state(block)
+ != BUF_BLOCK_FILE_PAGE
+ || !block->is_hashed) {
continue;
}
@@ -1132,6 +1241,7 @@ buf_relocate(
ut_ad(!bpage->in_zip_hash);
ut_ad(bpage->in_page_hash);
ut_ad(bpage == buf_page_hash_get(bpage->space, bpage->offset));
+ ut_ad(!buf_pool_watch_is(bpage));
#ifdef UNIV_DEBUG
switch (buf_page_get_state(bpage)) {
case BUF_BLOCK_ZIP_FREE:
@@ -1187,8 +1297,6 @@ buf_relocate(
HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage);
-
- UNIV_MEM_INVALID(bpage, sizeof *bpage);
}
/********************************************************************//**
@@ -1409,6 +1517,7 @@ buf_pool_page_hash_rebuild(void)
buf_page_address_fold(b->space, b->offset), b);
}
+ buf_flush_list_mutex_enter();
for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
b = UT_LIST_GET_NEXT(list, b)) {
ut_ad(b->in_flush_list);
@@ -1436,6 +1545,7 @@ buf_pool_page_hash_rebuild(void)
}
}
+ buf_flush_list_mutex_exit();
buf_pool_mutex_exit();
}
@@ -1496,6 +1606,199 @@ buf_pool_resize(void)
buf_pool_page_hash_rebuild();
}
+/** Maximum number of concurrent buffer pool watches */
+#define BUF_POOL_WATCH_SIZE 1
+/** Sentinel records for buffer pool watches. Protected by buf_pool_mutex. */
+static buf_page_t buf_pool_watch[BUF_POOL_WATCH_SIZE];
+
+/********************************************************************
+Determine if a block is a sentinel for a buffer pool watch.
+@return TRUE if a sentinel for a buffer pool watch, FALSE if not */
+UNIV_INTERN
+ibool
+buf_pool_watch_is(
+/*==============*/
+ const buf_page_t* bpage) /*!< in: block */
+{
+ ut_ad(buf_page_in_file(bpage));
+
+ if (UNIV_LIKELY(bpage < &buf_pool_watch[0]
+ || bpage >= &buf_pool_watch[BUF_POOL_WATCH_SIZE])) {
+
+ ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_PAGE
+ || bpage->zip.data != NULL);
+
+ return(FALSE);
+ }
+
+ ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE);
+ ut_ad(!bpage->in_zip_hash);
+ ut_ad(bpage->in_page_hash);
+ ut_ad(bpage->zip.data == NULL);
+ ut_ad(bpage->buf_fix_count > 0);
+ return(TRUE);
+}
+
+/****************************************************************//**
+Add watch for the given page to be read in. Caller must have the buffer pool
+mutex reserved.
+@return NULL if watch set, block if the page is in the buffer pool */
+UNIV_INTERN
+buf_page_t*
+buf_pool_watch_set(
+/*===============*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: page number */
+ ulint fold) /*!< in: buf_page_address_fold(space, offset) */
+{
+ buf_page_t* bpage;
+ ulint i;
+
+ ut_ad(buf_pool_mutex_own());
+
+ bpage = buf_page_hash_get_low(space, offset, fold);
+
+ if (UNIV_LIKELY_NULL(bpage)) {
+ if (!buf_pool_watch_is(bpage)) {
+ /* The page was loaded meanwhile. */
+ return(bpage);
+ }
+ /* Add to an existing watch. */
+ bpage->buf_fix_count++;
+ return(NULL);
+ }
+
+ for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
+ bpage = &buf_pool_watch[i];
+
+ ut_ad(bpage->access_time == 0);
+ ut_ad(bpage->newest_modification == 0);
+ ut_ad(bpage->oldest_modification == 0);
+ ut_ad(bpage->zip.data == NULL);
+ ut_ad(!bpage->in_zip_hash);
+
+ switch (bpage->state) {
+ case BUF_BLOCK_POOL_WATCH:
+ ut_ad(!bpage->in_page_hash);
+ ut_ad(bpage->buf_fix_count == 0);
+
+ /* bpage is pointing to buf_pool_watch[],
+ which is protected by buf_pool_mutex.
+ Normally, buf_page_t objects are protected by
+ buf_block_t::mutex or buf_pool_zip_mutex or both. */
+
+ bpage->state = BUF_BLOCK_ZIP_PAGE;
+ bpage->space = space;
+ bpage->offset = offset;
+ bpage->buf_fix_count = 1;
+
+ ut_d(bpage->in_page_hash = TRUE);
+ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
+ fold, bpage);
+ return(NULL);
+ case BUF_BLOCK_ZIP_PAGE:
+ ut_ad(bpage->in_page_hash);
+ ut_ad(bpage->buf_fix_count > 0);
+ break;
+ default:
+ ut_error;
+ }
+ }
+
+ /* Allocation failed. Either the maximum number of purge
+ threads should never exceed BUF_POOL_WATCH_SIZE, or this code
+ should be modified to return a special non-NULL value and the
+ caller should purge the record directly. */
+ ut_error;
+
+ /* Fix compiler warning */
+ return(NULL);
+}
+
+/****************************************************************//**
+Remove the sentinel block for the watch before replacing it with a real block.
+buf_page_watch_clear() or buf_page_watch_occurred() will notice that
+the block has been replaced with the real block.
+@return reference count, to be added to the replacement block */
+static
+void
+buf_pool_watch_remove(
+/*==================*/
+ ulint fold, /*!< in: buf_page_address_fold(space, offset) */
+ buf_page_t* watch) /*!< in/out: sentinel for watch */
+{
+ ut_ad(buf_pool_mutex_own());
+
+ HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, watch);
+ ut_d(watch->in_page_hash = FALSE);
+ watch->buf_fix_count = 0;
+ watch->state = BUF_BLOCK_POOL_WATCH;
+}
+
+/****************************************************************//**
+Stop watching if the page has been read in.
+buf_pool_watch_set(space,offset) must have returned NULL before. */
+UNIV_INTERN
+void
+buf_pool_watch_unset(
+/*=================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_page_t* bpage;
+ ulint fold = buf_page_address_fold(space, offset);
+
+ buf_pool_mutex_enter();
+ bpage = buf_page_hash_get_low(space, offset, fold);
+ /* The page must exist because buf_pool_watch_set()
+ increments buf_fix_count. */
+ ut_a(bpage);
+
+ if (UNIV_UNLIKELY(!buf_pool_watch_is(bpage))) {
+ mutex_t* mutex = buf_page_get_mutex(bpage);
+ mutex_enter(mutex);
+ ut_a(bpage->buf_fix_count > 0);
+ bpage->buf_fix_count--;
+ mutex_exit(mutex);
+ } else {
+ ut_a(bpage->buf_fix_count > 0);
+
+ if (UNIV_LIKELY(!--bpage->buf_fix_count)) {
+ buf_pool_watch_remove(fold, bpage);
+ }
+ }
+
+ buf_pool_mutex_exit();
+}
+
+/****************************************************************//**
+Check if the page has been read in.
+This may only be called after buf_pool_watch_set(space,offset)
+has returned NULL and before invoking buf_pool_watch_unset(space,offset).
+@return FALSE if the given page was not read in, TRUE if it was */
+UNIV_INTERN
+ibool
+buf_pool_watch_occurred(
+/*====================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_page_t* bpage;
+ ulint fold = buf_page_address_fold(space, offset);
+ ibool ret;
+
+ buf_pool_mutex_enter();
+
+ bpage = buf_page_hash_get_low(space, offset, fold);
+ /* The page must exist because buf_pool_watch_set()
+ increments buf_fix_count. */
+ ut_a(bpage);
+ ret = !buf_pool_watch_is(bpage);
+ buf_pool_mutex_exit();
+
+ return(ret);
+}
+
/********************************************************************//**
Moves a page to the start of the buffer pool LRU list. This high-level
function can be used to prevent an important page from slipping out of
@@ -1562,6 +1865,7 @@ buf_reset_check_index_page_at_flush(
block = (buf_block_t*) buf_page_hash_get(space, offset);
if (block && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE) {
+ ut_ad(!buf_pool_watch_is(&block->page));
block->check_index_page_at_flush = FALSE;
}
@@ -1590,6 +1894,7 @@ buf_page_peek_if_search_hashed(
if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
is_hashed = FALSE;
} else {
+ ut_ad(!buf_pool_watch_is(&block->page));
is_hashed = block->is_hashed;
}
@@ -1619,6 +1924,7 @@ buf_page_set_file_page_was_freed(
bpage = buf_page_hash_get(space, offset);
if (bpage) {
+ ut_ad(!buf_pool_watch_is(bpage));
bpage->file_page_was_freed = TRUE;
}
@@ -1647,6 +1953,7 @@ buf_page_reset_file_page_was_freed(
bpage = buf_page_hash_get(space, offset);
if (bpage) {
+ ut_ad(!buf_pool_watch_is(bpage));
bpage->file_page_was_freed = FALSE;
}
@@ -1688,6 +1995,7 @@ buf_page_get_zip(
lookup:
bpage = buf_page_hash_get(space, offset);
if (bpage) {
+ ut_ad(!buf_pool_watch_is(bpage));
break;
}
@@ -1709,6 +2017,8 @@ err_exit:
return(NULL);
}
+ ut_ad(!buf_pool_watch_is(bpage));
+
switch (buf_page_get_state(bpage)) {
case BUF_BLOCK_NOT_USED:
case BUF_BLOCK_READY_FOR_USE:
@@ -2025,29 +2335,36 @@ buf_page_get_gen(
ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
buf_block_t* guess, /*!< in: guessed block or NULL */
ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
- BUF_GET_NO_LATCH */
+ BUF_GET_NO_LATCH, or
+ BUF_GET_IF_IN_POOL_OR_WATCH */
const char* file, /*!< in: file name */
ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mini-transaction */
{
buf_block_t* block;
+ ulint fold;
unsigned access_time;
ulint fix_type;
ibool must_read;
+ ulint retries = 0;
ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
ut_ad((rw_latch == RW_S_LATCH)
|| (rw_latch == RW_X_LATCH)
|| (rw_latch == RW_NO_LATCH));
ut_ad((mode != BUF_GET_NO_LATCH) || (rw_latch == RW_NO_LATCH));
- ut_ad((mode == BUF_GET) || (mode == BUF_GET_IF_IN_POOL)
- || (mode == BUF_GET_NO_LATCH));
+ ut_ad(mode == BUF_GET
+ || mode == BUF_GET_IF_IN_POOL
+ || mode == BUF_GET_NO_LATCH
+ || mode == BUF_GET_IF_IN_POOL_OR_WATCH);
ut_ad(zip_size == fil_space_get_zip_size(space));
ut_ad(ut_is_2pow(zip_size));
#ifndef UNIV_LOG_DEBUG
ut_ad(!ibuf_inside() || ibuf_page(space, zip_size, offset, NULL));
#endif
buf_pool->stat.n_page_gets++;
+ fold = buf_page_address_fold(space, offset);
loop:
block = guess;
buf_pool_mutex_enter();
@@ -2074,21 +2391,59 @@ loop:
}
if (block == NULL) {
- block = (buf_block_t*) buf_page_hash_get(space, offset);
+ block = (buf_block_t*) buf_page_hash_get_low(space, offset,
+ fold);
}
loop2:
+ if (block && buf_pool_watch_is(&block->page)) {
+ block = NULL;
+ }
+
if (block == NULL) {
/* Page not in buf_pool: needs to be read from file */
+ if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
+ block = (buf_block_t*) buf_pool_watch_set(
+ space, offset, fold);
+
+ if (UNIV_LIKELY_NULL(block)) {
+
+ goto got_block;
+ }
+ }
+
buf_pool_mutex_exit();
- if (mode == BUF_GET_IF_IN_POOL) {
+ if (mode == BUF_GET_IF_IN_POOL
+ || mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
return(NULL);
}
- buf_read_page(space, zip_size, offset);
+ if (buf_read_page(space, zip_size, offset)) {
+ retries = 0;
+ } else if (retries < BUF_PAGE_READ_MAX_RETRIES) {
+ ++retries;
+ } else {
+ fprintf(stderr, "InnoDB: Error: Unable"
+ " to read tablespace %lu page no"
+ " %lu into the buffer pool after"
+ " %lu attempts\n"
+ "InnoDB: The most probable cause"
+ " of this error may be that the"
+ " table has been corrupted.\n"
+ "InnoDB: You can try to fix this"
+ " problem by using"
+ " innodb_force_recovery.\n"
+ "InnoDB: Please see reference manual"
+ " for more details.\n"
+ "InnoDB: Aborting...\n",
+ space, offset,
+ BUF_PAGE_READ_MAX_RETRIES);
+
+ ut_error;
+ }
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
ut_a(++buf_dbg_counter % 37 || buf_validate());
@@ -2096,12 +2451,16 @@ loop2:
goto loop;
}
+got_block:
ut_ad(page_zip_get_size(&block->page.zip) == zip_size);
must_read = buf_block_get_io_fix(block) == BUF_IO_READ;
if (must_read && mode == BUF_GET_IF_IN_POOL) {
- /* The page is only being read to buffer */
+
+ /* The page is being read to buffer pool,
+ but we cannot wait around for the read to
+ complete. */
buf_pool_mutex_exit();
return(NULL);
@@ -2147,7 +2506,7 @@ wait_until_unfixed:
{
buf_page_t* hash_bpage
- = buf_page_hash_get(space, offset);
+ = buf_page_hash_get_low(space, offset, fold);
if (UNIV_UNLIKELY(bpage != hash_bpage)) {
/* The buf_pool->page_hash was modified
@@ -2196,22 +2555,8 @@ wait_until_unfixed:
ut_ad(!block->page.in_flush_list);
} else {
/* Relocate buf_pool->flush_list. */
- buf_page_t* b;
-
- b = UT_LIST_GET_PREV(list, &block->page);
- ut_ad(block->page.in_flush_list);
- UT_LIST_REMOVE(list, buf_pool->flush_list,
- &block->page);
-
- if (b) {
- UT_LIST_INSERT_AFTER(
- list, buf_pool->flush_list, b,
- &block->page);
- } else {
- UT_LIST_ADD_FIRST(
- list, buf_pool->flush_list,
- &block->page);
- }
+ buf_flush_relocate_on_flush_list(bpage,
+ &block->page);
}
/* Buffer-fix, I/O-fix, and X-latch the block
@@ -2225,6 +2570,9 @@ wait_until_unfixed:
block->page.buf_fix_count = 1;
buf_block_set_io_fix(block, BUF_IO_READ);
rw_lock_x_lock(&block->lock);
+
+ UNIV_MEM_INVALID(bpage, sizeof *bpage);
+
mutex_exit(&block->mutex);
mutex_exit(&buf_pool_zip_mutex);
buf_pool->n_pend_unzip++;
@@ -2237,7 +2585,7 @@ wait_until_unfixed:
while not holding buf_pool_mutex or block->mutex. */
success = buf_zip_decompress(block, srv_use_checksums);
- if (UNIV_LIKELY(success)) {
+ if (UNIV_LIKELY(success && !recv_no_ibuf_operations)) {
ibuf_merge_or_delete_for_page(block, space, offset,
zip_size, TRUE);
}
@@ -2356,8 +2704,8 @@ page.
@return TRUE if success */
UNIV_INTERN
ibool
-buf_page_optimistic_get_func(
-/*=========================*/
+buf_page_optimistic_get(
+/*====================*/
ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
buf_block_t* block, /*!< in: guessed buffer block */
ib_uint64_t modify_clock,/*!< in: modify clock value if mode is
@@ -2370,7 +2718,9 @@ buf_page_optimistic_get_func(
ibool success;
ulint fix_type;
- ut_ad(mtr && block);
+ ut_ad(block);
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
mutex_enter(&block->mutex);
@@ -2482,6 +2832,7 @@ buf_page_get_known_nowait(
ulint fix_type;
ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
mutex_enter(&block->mutex);
@@ -2581,14 +2932,19 @@ buf_page_try_get_func(
ibool success;
ulint fix_type;
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
buf_pool_mutex_enter();
block = buf_block_hash_get(space_id, page_no);
- if (!block) {
+ if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
buf_pool_mutex_exit();
return(NULL);
}
+ ut_ad(!buf_pool_watch_is(&block->page));
+
mutex_enter(&block->mutex);
buf_pool_mutex_exit();
@@ -2673,6 +3029,7 @@ buf_page_init(
ulint space, /*!< in: space id */
ulint offset, /*!< in: offset of the page within space
in units of a page */
+ ulint fold, /*!< in: buf_page_address_fold(space,offset) */
buf_block_t* block) /*!< in: block to init */
{
buf_page_t* hash_page;
@@ -2697,11 +3054,20 @@ buf_page_init(
block->lock_hash_val = lock_rec_hash(space, offset);
+ buf_page_init_low(&block->page);
+
/* Insert into the hash table of file pages */
- hash_page = buf_page_hash_get(space, offset);
+ hash_page = buf_page_hash_get_low(space, offset, fold);
- if (UNIV_LIKELY_NULL(hash_page)) {
+ if (UNIV_LIKELY(!hash_page)) {
+ } else if (UNIV_LIKELY(buf_pool_watch_is(hash_page))) {
+ /* Preserve the reference count. */
+ ulint buf_fix_count = hash_page->buf_fix_count;
+ ut_a(buf_fix_count > 0);
+ block->page.buf_fix_count += buf_fix_count;
+ buf_pool_watch_remove(fold, hash_page);
+ } else {
fprintf(stderr,
"InnoDB: Error: page %lu %lu already found"
" in the hash table: %p, %p\n",
@@ -2719,13 +3085,11 @@ buf_page_init(
ut_error;
}
- buf_page_init_low(&block->page);
-
ut_ad(!block->page.in_zip_hash);
ut_ad(!block->page.in_page_hash);
ut_d(block->page.in_page_hash = TRUE);
HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
- buf_page_address_fold(space, offset), &block->page);
+ fold, &block->page);
}
/********************************************************************//**
@@ -2753,8 +3117,10 @@ buf_page_init_for_read(
ulint offset) /*!< in: page number */
{
buf_block_t* block;
- buf_page_t* bpage;
+ buf_page_t* bpage = NULL;
+ buf_page_t* watch_page;
mtr_t mtr;
+ ulint fold;
ibool lru = FALSE;
void* data;
@@ -2789,10 +3155,14 @@ buf_page_init_for_read(
ut_ad(block);
}
+ fold = buf_page_address_fold(space, offset);
+
buf_pool_mutex_enter();
- if (buf_page_hash_get(space, offset)) {
+ watch_page = buf_page_hash_get_low(space, offset, fold);
+ if (watch_page && !buf_pool_watch_is(watch_page)) {
/* The page is already in the buffer pool. */
+ watch_page = NULL;
err_exit:
if (block) {
mutex_enter(&block->mutex);
@@ -2816,7 +3186,8 @@ err_exit:
if (block) {
bpage = &block->page;
mutex_enter(&block->mutex);
- buf_page_init(space, offset, block);
+
+ buf_page_init(space, offset, fold, block);
/* The block must be put to the LRU list, to the old blocks */
buf_LRU_add_block(bpage, TRUE/* to old blocks */);
@@ -2875,15 +3246,19 @@ err_exit:
/* If buf_buddy_alloc() allocated storage from the LRU list,
it released and reacquired buf_pool_mutex. Thus, we must
check the page_hash again, as it may have been modified. */
- if (UNIV_UNLIKELY(lru)
- && UNIV_LIKELY_NULL(buf_page_hash_get(space, offset))) {
-
- /* The block was added by some other thread. */
- buf_buddy_free(bpage, sizeof *bpage);
- buf_buddy_free(data, zip_size);
-
- bpage = NULL;
- goto func_exit;
+ if (UNIV_UNLIKELY(lru)) {
+ watch_page = buf_page_hash_get_low(space, offset, fold);
+ if (UNIV_UNLIKELY
+ (watch_page && !buf_pool_watch_is(watch_page))) {
+
+ /* The block was added by some other thread. */
+ watch_page = NULL;
+ buf_buddy_free(bpage, sizeof *bpage);
+ buf_buddy_free(data, zip_size);
+
+ bpage = NULL;
+ goto func_exit;
+ }
}
page_zip_des_init(&bpage->zip);
@@ -2893,11 +3268,14 @@ err_exit:
mutex_enter(&buf_pool_zip_mutex);
UNIV_MEM_DESC(bpage->zip.data,
page_zip_get_size(&bpage->zip), bpage);
+
buf_page_init_low(bpage);
+
bpage->state = BUF_BLOCK_ZIP_PAGE;
bpage->space = space;
bpage->offset = offset;
+
#ifdef UNIV_DEBUG
bpage->in_page_hash = FALSE;
bpage->in_zip_hash = FALSE;
@@ -2907,8 +3285,18 @@ err_exit:
#endif /* UNIV_DEBUG */
ut_d(bpage->in_page_hash = TRUE);
- HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
- buf_page_address_fold(space, offset), bpage);
+
+ if (UNIV_LIKELY_NULL(watch_page)) {
+ /* Preserve the reference count. */
+ ulint buf_fix_count = watch_page->buf_fix_count;
+ ut_a(buf_fix_count > 0);
+ bpage->buf_fix_count += buf_fix_count;
+ ut_ad(buf_pool_watch_is(watch_page));
+ buf_pool_watch_remove(fold, watch_page);
+ }
+
+ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold,
+ bpage);
/* The block must be put to the LRU list, to the old blocks */
buf_LRU_add_block(bpage, TRUE/* to old blocks */);
@@ -2952,17 +3340,22 @@ buf_page_create(
buf_block_t* block;
buf_block_t* free_block = NULL;
ulint time_ms = ut_time_ms();
+ ulint fold;
ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
ut_ad(space || !zip_size);
free_block = buf_LRU_get_free_block(0);
+ fold = buf_page_address_fold(space, offset);
+
buf_pool_mutex_enter();
- block = (buf_block_t*) buf_page_hash_get(space, offset);
+ block = (buf_block_t*) buf_page_hash_get_low(space, offset, fold);
- if (block && buf_page_in_file(&block->page)) {
+ if (block && buf_page_in_file(&block->page)
+ && !buf_pool_watch_is(&block->page)) {
#ifdef UNIV_IBUF_COUNT_DEBUG
ut_a(ibuf_count_get(space, offset) == 0);
#endif
@@ -2992,7 +3385,7 @@ buf_page_create(
mutex_enter(&block->mutex);
- buf_page_init(space, offset, block);
+ buf_page_init(space, offset, fold, block);
/* The block must be put to the LRU list */
buf_LRU_add_block(&block->page, FALSE);
@@ -3441,11 +3834,6 @@ buf_validate(void)
}
n_lru++;
-
- if (block->page.oldest_modification > 0) {
- n_flush++;
- }
-
break;
case BUF_BLOCK_NOT_USED:
@@ -3484,6 +3872,10 @@ buf_validate(void)
ut_error;
break;
}
+
+ /* It is OK to read oldest_modification here because
+ we have acquired buf_pool_zip_mutex above which acts
+ as the 'block->mutex' for these bpages. */
ut_a(!b->oldest_modification);
ut_a(buf_page_hash_get(b->space, b->offset) == b);
@@ -3491,23 +3883,23 @@ buf_validate(void)
n_zip++;
}
- /* Check dirty compressed-only blocks. */
+ /* Check dirty blocks. */
+ buf_flush_list_mutex_enter();
for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
b = UT_LIST_GET_NEXT(list, b)) {
ut_ad(b->in_flush_list);
+ ut_a(b->oldest_modification);
+ n_flush++;
switch (buf_page_get_state(b)) {
case BUF_BLOCK_ZIP_DIRTY:
- ut_a(b->oldest_modification);
n_lru++;
- n_flush++;
n_zip++;
switch (buf_page_get_io_fix(b)) {
case BUF_IO_NONE:
case BUF_IO_READ:
break;
-
case BUF_IO_WRITE:
switch (buf_page_get_flush_type(b)) {
case BUF_FLUSH_LRU:
@@ -3540,6 +3932,10 @@ buf_validate(void)
ut_a(buf_page_hash_get(b->space, b->offset) == b);
}
+ ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
+
+ buf_flush_list_mutex_exit();
+
mutex_exit(&buf_pool_zip_mutex);
if (n_lru + n_free > buf_pool->curr_size + n_zip) {
@@ -3556,7 +3952,6 @@ buf_validate(void)
(ulong) n_free);
ut_error;
}
- ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush);
ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
@@ -3597,6 +3992,7 @@ buf_print(void)
counts = mem_alloc(sizeof(ulint) * size);
buf_pool_mutex_enter();
+ buf_flush_list_mutex_enter();
fprintf(stderr,
"buf_pool size %lu\n"
@@ -3623,6 +4019,8 @@ buf_print(void)
(ulong) buf_pool->stat.n_pages_created,
(ulong) buf_pool->stat.n_pages_written);
+ buf_flush_list_mutex_exit();
+
/* Count the number of blocks belonging to each index in the buffer */
n_found = 0;
@@ -3746,6 +4144,7 @@ buf_get_latched_pages_number(void)
}
}
+ buf_flush_list_mutex_enter();
for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
b = UT_LIST_GET_NEXT(list, b)) {
ut_ad(b->in_flush_list);
@@ -3771,6 +4170,7 @@ buf_get_latched_pages_number(void)
}
}
+ buf_flush_list_mutex_exit();
mutex_exit(&buf_pool_zip_mutex);
buf_pool_mutex_exit();
@@ -3803,16 +4203,13 @@ buf_get_modified_ratio_pct(void)
{
ulint ratio;
- buf_pool_mutex_enter();
-
+ /* This is for heuristics. No need to grab any mutex here. */
ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list))
/ (1 + UT_LIST_GET_LEN(buf_pool->LRU)
+ UT_LIST_GET_LEN(buf_pool->free));
/* 1 + is there to avoid division by zero */
- buf_pool_mutex_exit();
-
return(ratio);
}
@@ -3831,6 +4228,7 @@ buf_print_io(
ut_ad(buf_pool);
buf_pool_mutex_enter();
+ buf_flush_list_mutex_enter();
fprintf(file,
"Buffer pool size %lu\n"
@@ -3852,6 +4250,8 @@ buf_print_io(
+ buf_pool->init_flush[BUF_FLUSH_LIST],
(ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
+ buf_flush_list_mutex_exit();
+
current_time = time(NULL);
time_elapsed = 0.001 + difftime(current_time,
buf_pool->last_printout_time);
diff --git a/storage/innobase/buf/buf0flu.c b/storage/innobase/buf/buf0flu.c
index 8b614ce90e5..847f8dd9452 100644
--- a/storage/innobase/buf/buf0flu.c
+++ b/storage/innobase/buf/buf0flu.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -87,30 +87,176 @@ buf_flush_validate_low(void);
/*========================*/
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+/******************************************************************//**
+Insert a block in the flush_rbt and returns a pointer to its
+predecessor or NULL if no predecessor. The ordering is maintained
+on the basis of the <oldest_modification, space, offset> key.
+@return pointer to the predecessor or NULL if no predecessor. */
+static
+buf_page_t*
+buf_flush_insert_in_flush_rbt(
+/*==========================*/
+ buf_page_t* bpage) /*!< in: bpage to be inserted. */
+{
+ buf_page_t* prev = NULL;
+ const ib_rbt_node_t* c_node;
+ const ib_rbt_node_t* p_node;
+
+ ut_ad(buf_flush_list_mutex_own());
+
+ /* Insert this buffer into the rbt. */
+ c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
+ ut_a(c_node != NULL);
+
+ /* Get the predecessor. */
+ p_node = rbt_prev(buf_pool->flush_rbt, c_node);
+
+ if (p_node != NULL) {
+ prev = *rbt_value(buf_page_t*, p_node);
+ ut_a(prev != NULL);
+ }
+
+ return(prev);
+}
+
+/*********************************************************//**
+Delete a bpage from the flush_rbt. */
+static
+void
+buf_flush_delete_from_flush_rbt(
+/*============================*/
+ buf_page_t* bpage) /*!< in: bpage to be removed. */
+{
+
+ ibool ret = FALSE;
+
+ ut_ad(buf_flush_list_mutex_own());
+
+ ret = rbt_delete(buf_pool->flush_rbt, &bpage);
+ ut_ad(ret);
+}
+
+/*****************************************************************//**
+Compare two modified blocks in the buffer pool. The key for comparison
+is:
+key = <oldest_modification, space, offset>
+This comparison is used to maintian ordering of blocks in the
+buf_pool->flush_rbt.
+Note that for the purpose of flush_rbt, we only need to order blocks
+on the oldest_modification. The other two fields are used to uniquely
+identify the blocks.
+@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
+static
+int
+buf_flush_block_cmp(
+/*================*/
+ const void* p1, /*!< in: block1 */
+ const void* p2) /*!< in: block2 */
+{
+ int ret;
+ const buf_page_t* b1 = *(const buf_page_t**) p1;
+ const buf_page_t* b2 = *(const buf_page_t**) p2;
+
+ ut_ad(b1 != NULL);
+ ut_ad(b2 != NULL);
+
+ ut_ad(buf_flush_list_mutex_own());
+
+ ut_ad(b1->in_flush_list);
+ ut_ad(b2->in_flush_list);
+
+ if (b2->oldest_modification
+ > b1->oldest_modification) {
+ return(1);
+ }
+
+ if (b2->oldest_modification
+ < b1->oldest_modification) {
+ return(-1);
+ }
+
+ /* If oldest_modification is same then decide on the space. */
+ ret = (int)(b2->space - b1->space);
+
+ /* Or else decide ordering on the offset field. */
+ return(ret ? ret : (int)(b2->offset - b1->offset));
+}
+
+/********************************************************************//**
+Initialize the red-black tree to speed up insertions into the flush_list
+during recovery process. Should be called at the start of recovery
+process before any page has been read/written. */
+UNIV_INTERN
+void
+buf_flush_init_flush_rbt(void)
+/*==========================*/
+{
+ buf_flush_list_mutex_enter();
+
+ /* Create red black tree for speedy insertions in flush list. */
+ buf_pool->flush_rbt = rbt_create(sizeof(buf_page_t*),
+ buf_flush_block_cmp);
+ buf_flush_list_mutex_exit();
+}
+
+/********************************************************************//**
+Frees up the red-black tree. */
+UNIV_INTERN
+void
+buf_flush_free_flush_rbt(void)
+/*==========================*/
+{
+ buf_flush_list_mutex_enter();
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ rbt_free(buf_pool->flush_rbt);
+ buf_pool->flush_rbt = NULL;
+
+ buf_flush_list_mutex_exit();
+}
+
/********************************************************************//**
Inserts a modified block into the flush list. */
UNIV_INTERN
void
buf_flush_insert_into_flush_list(
/*=============================*/
- buf_block_t* block) /*!< in/out: block which is modified */
+ buf_block_t* block, /*!< in/out: block which is modified */
+ ib_uint64_t lsn) /*!< in: oldest modification */
{
- ut_ad(buf_pool_mutex_own());
+ ut_ad(!buf_pool_mutex_own());
+ ut_ad(buf_flush_order_mutex_own());
+ ut_ad(mutex_own(&block->mutex));
+
+ buf_flush_list_mutex_enter();
+
ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
|| (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
- <= block->page.oldest_modification));
+ <= lsn));
+
+ /* If we are in the recovery then we need to update the flush
+ red-black tree as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_list_mutex_exit();
+ buf_flush_insert_sorted_into_flush_list(block, lsn);
+ return;
+ }
ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
- ut_ad(block->page.in_LRU_list);
- ut_ad(block->page.in_page_hash);
- ut_ad(!block->page.in_zip_hash);
ut_ad(!block->page.in_flush_list);
+
ut_d(block->page.in_flush_list = TRUE);
+ block->page.oldest_modification = lsn;
UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_flush_list_mutex_exit();
}
/********************************************************************//**
@@ -121,27 +267,61 @@ UNIV_INTERN
void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
- buf_block_t* block) /*!< in/out: block which is modified */
+ buf_block_t* block, /*!< in/out: block which is modified */
+ ib_uint64_t lsn) /*!< in: oldest modification */
{
buf_page_t* prev_b;
buf_page_t* b;
- ut_ad(buf_pool_mutex_own());
+ ut_ad(!buf_pool_mutex_own());
+ ut_ad(buf_flush_order_mutex_own());
+ ut_ad(mutex_own(&block->mutex));
ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ buf_flush_list_mutex_enter();
+
+ /* The field in_LRU_list is protected by buf_pool_mutex, which
+ we are not holding. However, while a block is in the flush
+ list, it is dirty and cannot be discarded, not from the
+ page_hash or from the LRU list. At most, the uncompressed
+ page frame of a compressed block may be discarded or created
+ (copying the block->page to or from a buf_page_t that is
+ dynamically allocated from buf_buddy_alloc()). Because those
+ transitions hold block->mutex and the flush list mutex (via
+ buf_flush_relocate_on_flush_list()), there is no possibility
+ of a race condition in the assertions below. */
ut_ad(block->page.in_LRU_list);
ut_ad(block->page.in_page_hash);
+ /* buf_buddy_block_register() will take a block in the
+ BUF_BLOCK_MEMORY state, not a file page. */
ut_ad(!block->page.in_zip_hash);
+
ut_ad(!block->page.in_flush_list);
ut_d(block->page.in_flush_list = TRUE);
+ block->page.oldest_modification = lsn;
prev_b = NULL;
- b = UT_LIST_GET_FIRST(buf_pool->flush_list);
- while (b && b->oldest_modification > block->page.oldest_modification) {
- ut_ad(b->in_flush_list);
- prev_b = b;
- b = UT_LIST_GET_NEXT(list, b);
+ /* For the most part when this function is called the flush_rbt
+ should not be NULL. In a very rare boundary case it is possible
+ that the flush_rbt has already been freed by the recovery thread
+ before the last page was hooked up in the flush_list by the
+ io-handler thread. In that case we'll just do a simple
+ linear search in the else block. */
+ if (buf_pool->flush_rbt) {
+
+ prev_b = buf_flush_insert_in_flush_rbt(&block->page);
+
+ } else {
+
+ b = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+ while (b && b->oldest_modification
+ > block->page.oldest_modification) {
+ ut_ad(b->in_flush_list);
+ prev_b = b;
+ b = UT_LIST_GET_NEXT(list, b);
+ }
}
if (prev_b == NULL) {
@@ -154,6 +334,8 @@ buf_flush_insert_sorted_into_flush_list(
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_flush_list_mutex_exit();
}
/********************************************************************//**
@@ -237,7 +419,8 @@ buf_flush_remove(
ut_ad(buf_pool_mutex_own());
ut_ad(mutex_own(buf_page_get_mutex(bpage)));
ut_ad(bpage->in_flush_list);
- ut_d(bpage->in_flush_list = FALSE);
+
+ buf_flush_list_mutex_enter();
switch (buf_page_get_state(bpage)) {
case BUF_BLOCK_ZIP_PAGE:
@@ -259,10 +442,97 @@ buf_flush_remove(
break;
}
+ /* If the flush_rbt is active then delete from it as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_delete_from_flush_rbt(bpage);
+ }
+
+ /* Must be done after we have removed it from the flush_rbt
+ because we assert on in_flush_list in comparison function. */
+ ut_d(bpage->in_flush_list = FALSE);
+
bpage->oldest_modification = 0;
- ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
- ut_ad(ut_list_node_313->in_flush_list)));
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_flush_list_mutex_exit();
+}
+
+/*******************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage has already been
+copied to dpage.
+IMPORTANT: When this function is called bpage and dpage are not
+exact copy of each other. For example, they both will have different
+::state. Also the ::list pointers in dpage may be stale. We need to
+use the current list node (bpage) to do the list manipulation because
+the list pointers could have changed between the time that we copied
+the contents of bpage to the dpage and the flush list manipulation
+below. */
+UNIV_INTERN
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+ buf_page_t* bpage, /*!< in/out: control block being moved */
+ buf_page_t* dpage) /*!< in/out: destination block */
+{
+ buf_page_t* prev;
+ buf_page_t* prev_b = NULL;
+
+ ut_ad(buf_pool_mutex_own());
+
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+ buf_flush_list_mutex_enter();
+
+ /* FIXME: At this point we have both buf_pool and flush_list
+ mutexes. Theoratically removal of a block from flush list is
+ only covered by flush_list mutex but currently we do
+ have buf_pool mutex in buf_flush_remove() therefore this block
+ is guaranteed to be in the flush list. We need to check if
+ this will work without the assumption of block removing code
+ having the buf_pool mutex. */
+ ut_ad(bpage->in_flush_list);
+ ut_ad(dpage->in_flush_list);
+
+ /* If recovery is active we must swap the control blocks in
+ the flush_rbt as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_delete_from_flush_rbt(bpage);
+ prev_b = buf_flush_insert_in_flush_rbt(dpage);
+ }
+
+ /* Must be done after we have removed it from the flush_rbt
+ because we assert on in_flush_list in comparison function. */
+ ut_d(bpage->in_flush_list = FALSE);
+
+ prev = UT_LIST_GET_PREV(list, bpage);
+ UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
+
+ if (prev) {
+ ut_ad(prev->in_flush_list);
+ UT_LIST_INSERT_AFTER(
+ list,
+ buf_pool->flush_list,
+ prev, dpage);
+ } else {
+ UT_LIST_ADD_FIRST(
+ list,
+ buf_pool->flush_list,
+ dpage);
+ }
+
+ /* Just an extra check. Previous in flush_list
+ should be the same control block as in flush_rbt. */
+ ut_a(!buf_pool->flush_rbt || prev_b == prev);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_flush_list_mutex_exit();
}
/********************************************************************//**
@@ -723,6 +993,7 @@ buf_flush_write_block_low(
relocated in the buffer pool or removed from flush_list or
LRU_list. */
ut_ad(!buf_pool_mutex_own());
+ ut_ad(!buf_flush_list_mutex_own());
ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
ut_ad(bpage->oldest_modification != 0);
@@ -918,17 +1189,19 @@ buf_flush_try_neighbors(
ulint count = 0;
ulint i;
- ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+ ut_ad(flush_type == BUF_FLUSH_LRU
+ || flush_type == BUF_FLUSH_LIST);
if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
- /* If there is little space, it is better not to flush any
- block except from the end of the LRU list */
+ /* If there is little space, it is better not to flush
+ any block except from the end of the LRU list */
low = offset;
high = offset + 1;
} else {
- /* When flushed, dirty blocks are searched in neighborhoods of
- this size, and flushed along with the original page. */
+ /* When flushed, dirty blocks are searched in
+ neighborhoods of this size, and flushed along with the
+ original page. */
ulint buf_flush_area = ut_min(BUF_READ_AHEAD_AREA,
buf_pool->curr_size / 16);
@@ -969,11 +1242,12 @@ buf_flush_try_neighbors(
if (buf_flush_ready_for_flush(bpage, flush_type)
&& (i == offset || !bpage->buf_fix_count)) {
/* We only try to flush those
- neighbors != offset where the buf fix count is
- zero, as we then know that we probably can
- latch the page without a semaphore wait.
- Semaphore waits are expensive because we must
- flush the doublewrite buffer before we start
+ neighbors != offset where the buf fix
+ count is zero, as we then know that we
+ probably can latch the page without a
+ semaphore wait. Semaphore waits are
+ expensive because we must flush the
+ doublewrite buffer before we start
waiting. */
buf_flush_page(bpage, flush_type);
@@ -992,6 +1266,206 @@ buf_flush_try_neighbors(
return(count);
}
+/********************************************************************//**
+Check if the block is modified and ready for flushing. If the the block
+is ready to flush then flush the page and try o flush its neighbors.
+
+@return TRUE if buf_pool mutex was not released during this function.
+This does not guarantee that some pages were written as well.
+Number of pages written are incremented to the count. */
+static
+ibool
+buf_flush_page_and_try_neighbors(
+/*=============================*/
+ buf_page_t* bpage, /*!< in: buffer control block,
+ must be
+ buf_page_in_file(bpage) */
+ enum buf_flush flush_type, /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+ ulint* count) /*!< in/out: number of pages
+ flushed */
+{
+ ibool flushed = FALSE;
+ mutex_t* block_mutex;
+
+ ut_ad(buf_pool_mutex_own());
+
+ block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
+
+ ut_a(buf_page_in_file(bpage));
+
+ if (buf_flush_ready_for_flush(bpage, flush_type)) {
+ ulint space;
+ ulint offset;
+
+ buf_pool_mutex_exit();
+
+ /* These fields are protected by both the
+ buffer pool mutex and block mutex. */
+ space = buf_page_get_space(bpage);
+ offset = buf_page_get_page_no(bpage);
+
+ mutex_exit(block_mutex);
+
+ /* Try to flush also all the neighbors */
+ *count += buf_flush_try_neighbors(space, offset,
+ flush_type);
+
+ buf_pool_mutex_enter();
+ flushed = TRUE;
+ } else {
+ mutex_exit(block_mutex);
+ }
+
+ ut_ad(buf_pool_mutex_own());
+
+ return(flushed);
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the LRU list.
+In the case of an LRU flush the calling thread may own latches to
+pages: to avoid deadlocks, this function must be written so that it
+cannot end up waiting for these latches!
+@return number of blocks for which the write request was queued. */
+static
+ulint
+buf_flush_LRU_list_batch(
+/*=====================*/
+ ulint max) /*!< in: max of blocks to flush */
+{
+ buf_page_t* bpage;
+ ulint count = 0;
+
+ ut_ad(buf_pool_mutex_own());
+
+ do {
+ /* Start from the end of the list looking for a
+ suitable block to be flushed. */
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+
+ /* Iterate backwards over the flush list till we find
+ a page that isn't ready for flushing. */
+ while (bpage != NULL
+ && !buf_flush_page_and_try_neighbors(
+ bpage, BUF_FLUSH_LRU, &count)) {
+
+ bpage = UT_LIST_GET_PREV(LRU, bpage);
+ }
+ } while (bpage != NULL && count < max);
+
+ /* We keep track of all flushes happening as part of LRU
+ flush. When estimating the desired rate at which flush_list
+ should be flushed, we factor in this value. */
+ buf_lru_flush_page_count += count;
+
+ ut_ad(buf_pool_mutex_own());
+
+ return(count);
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the flush_list.
+the calling thread is not allowed to own any latches on pages!
+@return number of blocks for which the write request was queued;
+ULINT_UNDEFINED if there was a flush of the same type already
+running */
+static
+ulint
+buf_flush_flush_list_batch(
+/*=======================*/
+ ulint min_n, /*!< in: wished minimum mumber
+ of blocks flushed (it is not
+ guaranteed that the actual
+ number is that big, though) */
+ ib_uint64_t lsn_limit) /*!< all blocks whose
+ oldest_modification is smaller
+ than this should be flushed (if
+ their number does not exceed
+ min_n) */
+{
+ ulint len;
+ buf_page_t* bpage;
+ ulint count = 0;
+
+ ut_ad(buf_pool_mutex_own());
+
+ /* If we have flushed enough, leave the loop */
+ do {
+ /* Start from the end of the list looking for a suitable
+ block to be flushed. */
+
+ buf_flush_list_mutex_enter();
+
+ /* We use len here because theoratically insertions can
+ happen in the flush_list below while we are traversing
+ it for a suitable candidate for flushing. We'd like to
+ set a limit on how farther we are willing to traverse
+ the list. */
+ len = UT_LIST_GET_LEN(buf_pool->flush_list);
+ bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+
+ if (bpage) {
+ ut_a(bpage->oldest_modification > 0);
+ }
+
+
+ if (!bpage || bpage->oldest_modification >= lsn_limit) {
+
+ /* We have flushed enough */
+ buf_flush_list_mutex_exit();
+ break;
+ }
+
+ ut_a(bpage->oldest_modification > 0);
+
+ ut_ad(bpage->in_flush_list);
+
+ buf_flush_list_mutex_exit();
+
+ /* The list may change during the flushing and we cannot
+ safely preserve within this function a pointer to a
+ block in the list! */
+ while (bpage != NULL
+ && len > 0
+ && !buf_flush_page_and_try_neighbors(
+ bpage, BUF_FLUSH_LIST, &count)) {
+
+ buf_flush_list_mutex_enter();
+
+ /* If we are here that means that buf_pool
+ mutex was not released in
+ buf_flush_page_and_try_neighbors() above and
+ this guarantees that bpage didn't get
+ relocated since we released the flush_list
+ mutex above. There is a chance, however, that
+ the bpage got removed from flush_list (not
+ currently possible because flush_list_remove()
+ also obtains buf_pool mutex but that may change
+ in future). To avoid this scenario we check
+ the oldest_modification and if it is zero
+ we start all over again. */
+ if (bpage->oldest_modification == 0) {
+ buf_flush_list_mutex_exit();
+ break;
+ }
+ bpage = UT_LIST_GET_PREV(list, bpage);
+
+ ut_ad(!bpage || bpage->in_flush_list);
+
+ buf_flush_list_mutex_exit();
+
+ --len;
+ }
+
+ } while (count < min_n && bpage != NULL && len > 0);
+
+ ut_ad(buf_pool_mutex_own());
+
+ return(count);
+}
+
/*******************************************************************//**
This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
@@ -1017,22 +1491,18 @@ buf_flush_batch(
(if their number does not exceed
min_n), otherwise ignored */
{
- buf_page_t* bpage;
- ulint page_count = 0;
- ulint old_page_count;
- ulint space;
- ulint offset;
+ ulint count = 0;
- ut_ad((flush_type == BUF_FLUSH_LRU)
- || (flush_type == BUF_FLUSH_LIST));
+ ut_ad(flush_type == BUF_FLUSH_LRU
+ || flush_type == BUF_FLUSH_LIST);
#ifdef UNIV_SYNC_DEBUG
ut_ad((flush_type != BUF_FLUSH_LIST)
|| sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */
buf_pool_mutex_enter();
- if ((buf_pool->n_flush[flush_type] > 0)
- || (buf_pool->init_flush[flush_type] == TRUE)) {
+ if (buf_pool->n_flush[flush_type] > 0
+ || buf_pool->init_flush[flush_type] == TRUE) {
/* There is already a flush batch of the same type running */
@@ -1043,82 +1513,21 @@ buf_flush_batch(
buf_pool->init_flush[flush_type] = TRUE;
- for (;;) {
-flush_next:
- /* If we have flushed enough, leave the loop */
- if (page_count >= min_n) {
-
- break;
- }
-
- /* Start from the end of the list looking for a suitable
- block to be flushed. */
-
- if (flush_type == BUF_FLUSH_LRU) {
- bpage = UT_LIST_GET_LAST(buf_pool->LRU);
- } else {
- ut_ad(flush_type == BUF_FLUSH_LIST);
-
- bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
- if (!bpage
- || bpage->oldest_modification >= lsn_limit) {
- /* We have flushed enough */
-
- break;
- }
- ut_ad(bpage->in_flush_list);
- }
-
- /* Note that after finding a single flushable page, we try to
- flush also all its neighbors, and after that start from the
- END of the LRU list or flush list again: the list may change
- during the flushing and we cannot safely preserve within this
- function a pointer to a block in the list! */
-
- do {
- mutex_t*block_mutex = buf_page_get_mutex(bpage);
- ibool ready;
-
- ut_a(buf_page_in_file(bpage));
-
- mutex_enter(block_mutex);
- ready = buf_flush_ready_for_flush(bpage, flush_type);
- mutex_exit(block_mutex);
-
- if (ready) {
- space = buf_page_get_space(bpage);
- offset = buf_page_get_page_no(bpage);
-
- buf_pool_mutex_exit();
-
- old_page_count = page_count;
-
- /* Try to flush also all the neighbors */
- page_count += buf_flush_try_neighbors(
- space, offset, flush_type);
- /* fprintf(stderr,
- "Flush type %lu, page no %lu, neighb %lu\n",
- flush_type, offset,
- page_count - old_page_count); */
-
- buf_pool_mutex_enter();
- goto flush_next;
-
- } else if (flush_type == BUF_FLUSH_LRU) {
- bpage = UT_LIST_GET_PREV(LRU, bpage);
- } else {
- ut_ad(flush_type == BUF_FLUSH_LIST);
-
- bpage = UT_LIST_GET_PREV(list, bpage);
- ut_ad(!bpage || bpage->in_flush_list);
- }
- } while (bpage != NULL);
-
- /* If we could not find anything to flush, leave the loop */
-
+ /* Note: The buffer pool mutex is released and reacquired within
+ the flush functions. */
+ switch(flush_type) {
+ case BUF_FLUSH_LRU:
+ count = buf_flush_LRU_list_batch(min_n);
+ break;
+ case BUF_FLUSH_LIST:
+ count = buf_flush_flush_list_batch(min_n, lsn_limit);
break;
+ default:
+ ut_error;
}
+ ut_ad(buf_pool_mutex_own());
+
buf_pool->init_flush[flush_type] = FALSE;
if (buf_pool->n_flush[flush_type] == 0) {
@@ -1133,26 +1542,17 @@ flush_next:
buf_flush_buffered_writes();
#ifdef UNIV_DEBUG
- if (buf_debug_prints && page_count > 0) {
- ut_a(flush_type == BUF_FLUSH_LRU
- || flush_type == BUF_FLUSH_LIST);
+ if (buf_debug_prints && count > 0) {
fprintf(stderr, flush_type == BUF_FLUSH_LRU
? "Flushed %lu pages in LRU flush\n"
: "Flushed %lu pages in flush list flush\n",
- (ulong) page_count);
+ (ulong) count);
}
#endif /* UNIV_DEBUG */
- srv_buf_pool_flushed += page_count;
-
- /* We keep track of all flushes happening as part of LRU
- flush. When estimating the desired rate at which flush_list
- should be flushed we factor in this value. */
- if (flush_type == BUF_FLUSH_LRU) {
- buf_lru_flush_page_count += page_count;
- }
+ srv_buf_pool_flushed += count;
- return(page_count);
+ return(count);
}
/******************************************************************//**
@@ -1367,24 +1767,56 @@ ibool
buf_flush_validate_low(void)
/*========================*/
{
- buf_page_t* bpage;
+ buf_page_t* bpage;
+ const ib_rbt_node_t* rnode = NULL;
+
+ ut_ad(buf_flush_list_mutex_own());
UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
ut_ad(ut_list_node_313->in_flush_list));
bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
+ /* If we are in recovery mode i.e.: flush_rbt != NULL
+ then each block in the flush_list must also be present
+ in the flush_rbt. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ rnode = rbt_first(buf_pool->flush_rbt);
+ }
+
while (bpage != NULL) {
const ib_uint64_t om = bpage->oldest_modification;
ut_ad(bpage->in_flush_list);
- ut_a(buf_page_in_file(bpage));
+
+ /* A page in flush_list can be in BUF_BLOCK_REMOVE_HASH
+ state. This happens when a page is in the middle of
+ being relocated. In that case the original descriptor
+ can have this state and still be in the flush list
+ waiting to acquire the flush_list_mutex to complete
+ the relocation. */
+ ut_a(buf_page_in_file(bpage)
+ || buf_page_get_state(bpage)
+ == BUF_BLOCK_REMOVE_HASH);
ut_a(om > 0);
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ ut_a(rnode);
+ buf_page_t* rpage = *rbt_value(buf_page_t*,
+ rnode);
+ ut_a(rpage);
+ ut_a(rpage == bpage);
+ rnode = rbt_next(buf_pool->flush_rbt, rnode);
+ }
+
bpage = UT_LIST_GET_NEXT(list, bpage);
ut_a(!bpage || om >= bpage->oldest_modification);
}
+ /* By this time we must have exhausted the traversal of
+ flush_rbt (if active) as well. */
+ ut_a(rnode == NULL);
+
return(TRUE);
}
@@ -1398,11 +1830,11 @@ buf_flush_validate(void)
{
ibool ret;
- buf_pool_mutex_enter();
+ buf_flush_list_mutex_enter();
ret = buf_flush_validate_low();
- buf_pool_mutex_exit();
+ buf_flush_list_mutex_exit();
return(ret);
}
diff --git a/storage/innobase/buf/buf0lru.c b/storage/innobase/buf/buf0lru.c
index 4f19fd13fa5..c7feb3ae79b 100644
--- a/storage/innobase/buf/buf0lru.c
+++ b/storage/innobase/buf/buf0lru.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -350,17 +350,31 @@ scan_again:
bpage = UT_LIST_GET_LAST(buf_pool->LRU);
while (bpage != NULL) {
- mutex_t* block_mutex = buf_page_get_mutex(bpage);
buf_page_t* prev_bpage;
+ ibool prev_bpage_buf_fix = FALSE;
ut_a(buf_page_in_file(bpage));
- mutex_enter(block_mutex);
prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
- if (buf_page_get_space(bpage) == id) {
- if (bpage->buf_fix_count > 0
- || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+ /* bpage->space and bpage->io_fix are protected by
+ buf_pool_mutex and block_mutex. It is safe to check
+ them while holding buf_pool_mutex only. */
+
+ if (buf_page_get_space(bpage) != id) {
+ /* Skip this block, as it does not belong to
+ the space that is being invalidated. */
+ } else if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+ /* We cannot remove this page during this scan
+ yet; maybe the system is currently reading it
+ in, or flushing the modifications to the file */
+
+ all_freed = FALSE;
+ } else {
+ mutex_t* block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
+
+ if (bpage->buf_fix_count > 0) {
/* We cannot remove this page during
this scan yet; maybe the system is
@@ -380,8 +394,40 @@ scan_again:
(ulong) buf_page_get_page_no(bpage));
}
#endif
- if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE
- && ((buf_block_t*) bpage)->is_hashed) {
+ if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
+ /* This is a compressed-only block
+ descriptor. Ensure that prev_bpage
+ cannot be relocated when bpage is freed. */
+ if (UNIV_LIKELY(prev_bpage != NULL)) {
+ switch (buf_page_get_state(
+ prev_bpage)) {
+ case BUF_BLOCK_FILE_PAGE:
+ /* Descriptors of uncompressed
+ blocks will not be relocated,
+ because we are holding the
+ buf_pool_mutex. */
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ /* Descriptors of compressed-
+ only blocks can be relocated,
+ unless they are buffer-fixed.
+ Because both bpage and
+ prev_bpage are protected by
+ buf_pool_zip_mutex, it is
+ not necessary to acquire
+ further mutexes. */
+ ut_ad(&buf_pool_zip_mutex
+ == block_mutex);
+ ut_ad(mutex_own(block_mutex));
+ prev_bpage_buf_fix = TRUE;
+ prev_bpage->buf_fix_count++;
+ break;
+ default:
+ ut_error;
+ }
+ }
+ } else if (((buf_block_t*) bpage)->is_hashed) {
ulint page_no;
ulint zip_size;
@@ -405,7 +451,8 @@ scan_again:
buf_flush_remove(bpage);
}
- /* Remove from the LRU list */
+ /* Remove from the LRU list. */
+
if (buf_LRU_block_remove_hashed_page(bpage, TRUE)
!= BUF_BLOCK_ZIP_FREE) {
buf_LRU_block_free_hashed_page((buf_block_t*)
@@ -417,18 +464,27 @@ scan_again:
ut_ad(block_mutex == &buf_pool_zip_mutex);
ut_ad(!mutex_own(block_mutex));
- /* The compressed block descriptor
- (bpage) has been deallocated and
- block_mutex released. Also,
- buf_buddy_free() may have relocated
- prev_bpage. Rescan the LRU list. */
+ if (prev_bpage_buf_fix) {
+ /* We temporarily buffer-fixed
+ prev_bpage, so that
+ buf_buddy_free() could not
+ relocate it, in case it was a
+ compressed-only block
+ descriptor. */
+
+ mutex_enter(block_mutex);
+ ut_ad(prev_bpage->buf_fix_count > 0);
+ prev_bpage->buf_fix_count--;
+ mutex_exit(block_mutex);
+ }
- bpage = UT_LIST_GET_LAST(buf_pool->LRU);
- continue;
+ goto next_page_no_mutex;
}
- }
next_page:
- mutex_exit(block_mutex);
+ mutex_exit(block_mutex);
+ }
+
+next_page_no_mutex:
bpage = prev_bpage;
}
@@ -1398,8 +1454,10 @@ alloc:
buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b);
const ulint fold = buf_page_address_fold(
bpage->space, bpage->offset);
+ buf_page_t* hash_b = buf_page_hash_get_low(
+ bpage->space, bpage->offset, fold);
- ut_a(!buf_page_hash_get(bpage->space, bpage->offset));
+ ut_a(!hash_b);
b->state = b->oldest_modification
? BUF_BLOCK_ZIP_DIRTY
@@ -1474,26 +1532,8 @@ alloc:
if (b->state == BUF_BLOCK_ZIP_PAGE) {
buf_LRU_insert_zip_clean(b);
} else {
- buf_page_t* prev;
-
- ut_ad(b->in_flush_list);
- ut_d(bpage->in_flush_list = FALSE);
-
- prev = UT_LIST_GET_PREV(list, b);
- UT_LIST_REMOVE(list, buf_pool->flush_list, b);
-
- if (prev) {
- ut_ad(prev->in_flush_list);
- UT_LIST_INSERT_AFTER(
- list,
- buf_pool->flush_list,
- prev, b);
- } else {
- UT_LIST_ADD_FIRST(
- list,
- buf_pool->flush_list,
- b);
- }
+ /* Relocate on buf_pool->flush_list. */
+ buf_flush_relocate_on_flush_list(bpage, b);
}
bpage->zip.data = NULL;
@@ -1642,6 +1682,7 @@ buf_LRU_block_remove_hashed_page(
ibool zip) /*!< in: TRUE if should remove also the
compressed page of an uncompressed page */
{
+ ulint fold;
const buf_page_t* hashed_bpage;
ut_ad(bpage);
ut_ad(buf_pool_mutex_own());
@@ -1725,7 +1766,9 @@ buf_LRU_block_remove_hashed_page(
break;
}
- hashed_bpage = buf_page_hash_get(bpage->space, bpage->offset);
+ fold = buf_page_address_fold(bpage->space, bpage->offset);
+ hashed_bpage = buf_page_hash_get_low(bpage->space, bpage->offset,
+ fold);
if (UNIV_UNLIKELY(bpage != hashed_bpage)) {
fprintf(stderr,
@@ -1757,9 +1800,7 @@ buf_LRU_block_remove_hashed_page(
ut_ad(!bpage->in_zip_hash);
ut_ad(bpage->in_page_hash);
ut_d(bpage->in_page_hash = FALSE);
- HASH_DELETE(buf_page_t, hash, buf_pool->page_hash,
- buf_page_address_fold(bpage->space, bpage->offset),
- bpage);
+ HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
switch (buf_page_get_state(bpage)) {
case BUF_BLOCK_ZIP_PAGE:
ut_ad(!bpage->in_free_list);
@@ -2036,6 +2077,7 @@ buf_LRU_print(void)
while (bpage != NULL) {
+ mutex_enter(buf_page_get_mutex(bpage));
fprintf(stderr, "BLOCK space %lu page %lu ",
(ulong) buf_page_get_space(bpage),
(ulong) buf_page_get_page_no(bpage));
@@ -2084,6 +2126,7 @@ buf_LRU_print(void)
break;
}
+ mutex_exit(buf_page_get_mutex(bpage));
bpage = UT_LIST_GET_NEXT(LRU, bpage);
}
diff --git a/storage/innobase/buf/buf0rea.c b/storage/innobase/buf/buf0rea.c
index dd98ea17eb5..81f788baac2 100644
--- a/storage/innobase/buf/buf0rea.c
+++ b/storage/innobase/buf/buf0rea.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -608,14 +608,14 @@ buf_read_recv_pages(
while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
os_aio_simulated_wake_handler_threads();
- os_thread_sleep(500000);
+ os_thread_sleep(10000);
count++;
- if (count > 100) {
+ if (count > 1000) {
fprintf(stderr,
"InnoDB: Error: InnoDB has waited for"
- " 50 seconds for pending\n"
+ " 10 seconds for pending\n"
"InnoDB: reads to the buffer pool to"
" be finished.\n"
"InnoDB: Number of pending reads %lu,"
diff --git a/storage/innobase/dict/dict0boot.c b/storage/innobase/dict/dict0boot.c
index e55de30481b..70b5bfa99f7 100644
--- a/storage/innobase/dict/dict0boot.c
+++ b/storage/innobase/dict/dict0boot.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -274,6 +274,9 @@ dict_boot(void)
and (TYPE & DICT_TF_FORMAT_MASK) are nonzero and TYPE = table->flags */
dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0);
+ /* MIX_LEN may contain additional table flags when
+ ROW_FORMAT!=REDUNDANT. Currently, these flags include
+ DICT_TF2_TEMPORARY. */
dict_mem_table_add_col(table, heap, "MIX_LEN", DATA_INT, 0, 4);
dict_mem_table_add_col(table, heap, "CLUSTER_NAME", DATA_BINARY, 0, 0);
dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);
@@ -355,7 +358,7 @@ dict_boot(void)
dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);
dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4);
- /* The '+ 2' below comes from the 2 system fields */
+ /* The '+ 2' below comes from the fields DB_TRX_ID, DB_ROLL_PTR */
#if DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2
#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2"
#endif
@@ -365,6 +368,9 @@ dict_boot(void)
#if DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2
#error "DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2"
#endif
+#if DICT_SYS_INDEXES_NAME_FIELD != 1 + 2
+#error "DICT_SYS_INDEXES_NAME_FIELD != 1 + 2"
+#endif
table->id = DICT_INDEXES_ID;
dict_table_add_to_cache(table, heap);
diff --git a/storage/innobase/dict/dict0crea.c b/storage/innobase/dict/dict0crea.c
index 96a9bd8152e..4ba7cd8a48c 100644
--- a/storage/innobase/dict/dict0crea.c
+++ b/storage/innobase/dict/dict0crea.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -51,16 +51,18 @@ static
dtuple_t*
dict_create_sys_tables_tuple(
/*=========================*/
- dict_table_t* table, /*!< in: table */
- mem_heap_t* heap) /*!< in: memory heap from which the memory for
- the built tuple is allocated */
+ const dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
{
dict_table_t* sys_tables;
dtuple_t* entry;
dfield_t* dfield;
byte* ptr;
- ut_ad(table && heap);
+ ut_ad(table);
+ ut_ad(heap);
sys_tables = dict_sys->sys_tables;
@@ -69,18 +71,18 @@ dict_create_sys_tables_tuple(
dict_table_copy_types(entry, sys_tables);
/* 0: NAME -----------------------------*/
- dfield = dtuple_get_nth_field(entry, 0);
+ dfield = dtuple_get_nth_field(entry, 0/*NAME*/);
dfield_set_data(dfield, table->name, ut_strlen(table->name));
/* 3: ID -------------------------------*/
- dfield = dtuple_get_nth_field(entry, 1);
+ dfield = dtuple_get_nth_field(entry, 1/*ID*/);
ptr = mem_heap_alloc(heap, 8);
mach_write_to_8(ptr, table->id);
dfield_set_data(dfield, ptr, 8);
/* 4: N_COLS ---------------------------*/
- dfield = dtuple_get_nth_field(entry, 2);
+ dfield = dtuple_get_nth_field(entry, 2/*N_COLS*/);
#if DICT_TF_COMPACT != 1
#error
@@ -91,40 +93,41 @@ dict_create_sys_tables_tuple(
| ((table->flags & DICT_TF_COMPACT) << 31));
dfield_set_data(dfield, ptr, 4);
/* 5: TYPE -----------------------------*/
- dfield = dtuple_get_nth_field(entry, 3);
+ dfield = dtuple_get_nth_field(entry, 3/*TYPE*/);
ptr = mem_heap_alloc(heap, 4);
- if (table->flags & ~DICT_TF_COMPACT) {
+ if (table->flags & (~DICT_TF_COMPACT & ~(~0 << DICT_TF_BITS))) {
ut_a(table->flags & DICT_TF_COMPACT);
ut_a(dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP);
ut_a((table->flags & DICT_TF_ZSSIZE_MASK)
<= (DICT_TF_ZSSIZE_MAX << DICT_TF_ZSSIZE_SHIFT));
- ut_a(!(table->flags & (~0 << DICT_TF_BITS)));
- mach_write_to_4(ptr, table->flags);
+ ut_a(!(table->flags & (~0 << DICT_TF2_BITS)));
+ mach_write_to_4(ptr, table->flags & ~(~0 << DICT_TF_BITS));
} else {
mach_write_to_4(ptr, DICT_TABLE_ORDINARY);
}
dfield_set_data(dfield, ptr, 4);
/* 6: MIX_ID (obsolete) ---------------------------*/
- dfield = dtuple_get_nth_field(entry, 4);
+ dfield = dtuple_get_nth_field(entry, 4/*MIX_ID*/);
ptr = mem_heap_zalloc(heap, 8);
dfield_set_data(dfield, ptr, 8);
- /* 7: MIX_LEN (obsolete) --------------------------*/
+ /* 7: MIX_LEN (additional flags) --------------------------*/
- dfield = dtuple_get_nth_field(entry, 5);
+ dfield = dtuple_get_nth_field(entry, 5/*MIX_LEN*/);
- ptr = mem_heap_zalloc(heap, 4);
+ ptr = mem_heap_alloc(heap, 4);
+ mach_write_to_4(ptr, table->flags >> DICT_TF2_SHIFT);
dfield_set_data(dfield, ptr, 4);
/* 8: CLUSTER_NAME ---------------------*/
- dfield = dtuple_get_nth_field(entry, 6);
+ dfield = dtuple_get_nth_field(entry, 6/*CLUSTER_NAME*/);
dfield_set_null(dfield); /* not supported */
/* 9: SPACE ----------------------------*/
- dfield = dtuple_get_nth_field(entry, 7);
+ dfield = dtuple_get_nth_field(entry, 7/*SPACE*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, table->space);
@@ -143,19 +146,21 @@ static
dtuple_t*
dict_create_sys_columns_tuple(
/*==========================*/
- dict_table_t* table, /*!< in: table */
- ulint i, /*!< in: column number */
- mem_heap_t* heap) /*!< in: memory heap from which the memory for
- the built tuple is allocated */
+ const dict_table_t* table, /*!< in: table */
+ ulint i, /*!< in: column number */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
{
dict_table_t* sys_columns;
dtuple_t* entry;
const dict_col_t* column;
dfield_t* dfield;
byte* ptr;
- const char* col_name;
+ const char* col_name;
- ut_ad(table && heap);
+ ut_ad(table);
+ ut_ad(heap);
column = dict_table_get_nth_col(table, i);
@@ -166,47 +171,47 @@ dict_create_sys_columns_tuple(
dict_table_copy_types(entry, sys_columns);
/* 0: TABLE_ID -----------------------*/
- dfield = dtuple_get_nth_field(entry, 0);
+ dfield = dtuple_get_nth_field(entry, 0/*TABLE_ID*/);
ptr = mem_heap_alloc(heap, 8);
mach_write_to_8(ptr, table->id);
dfield_set_data(dfield, ptr, 8);
/* 1: POS ----------------------------*/
- dfield = dtuple_get_nth_field(entry, 1);
+ dfield = dtuple_get_nth_field(entry, 1/*POS*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, i);
dfield_set_data(dfield, ptr, 4);
/* 4: NAME ---------------------------*/
- dfield = dtuple_get_nth_field(entry, 2);
+ dfield = dtuple_get_nth_field(entry, 2/*NAME*/);
col_name = dict_table_get_col_name(table, i);
dfield_set_data(dfield, col_name, ut_strlen(col_name));
/* 5: MTYPE --------------------------*/
- dfield = dtuple_get_nth_field(entry, 3);
+ dfield = dtuple_get_nth_field(entry, 3/*MTYPE*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, column->mtype);
dfield_set_data(dfield, ptr, 4);
/* 6: PRTYPE -------------------------*/
- dfield = dtuple_get_nth_field(entry, 4);
+ dfield = dtuple_get_nth_field(entry, 4/*PRTYPE*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, column->prtype);
dfield_set_data(dfield, ptr, 4);
/* 7: LEN ----------------------------*/
- dfield = dtuple_get_nth_field(entry, 5);
+ dfield = dtuple_get_nth_field(entry, 5/*LEN*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, column->len);
dfield_set_data(dfield, ptr, 4);
/* 8: PREC ---------------------------*/
- dfield = dtuple_get_nth_field(entry, 6);
+ dfield = dtuple_get_nth_field(entry, 6/*PREC*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, 0/* unused */);
@@ -230,6 +235,7 @@ dict_build_table_def_step(
dict_table_t* table;
dtuple_t* row;
ulint error;
+ ulint flags;
const char* path_or_name;
ibool is_path;
mtr_t mtr;
@@ -268,9 +274,10 @@ dict_build_table_def_step(
ut_ad(!dict_table_zip_size(table)
|| dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP);
+ flags = table->flags & ~(~0 << DICT_TF_BITS);
error = fil_create_new_single_table_tablespace(
&space, path_or_name, is_path,
- table->flags == DICT_TF_COMPACT ? 0 : table->flags,
+ flags == DICT_TF_COMPACT ? 0 : flags,
FIL_IBD_FILE_INITIAL_SIZE);
table->space = (unsigned int) space;
@@ -286,7 +293,7 @@ dict_build_table_def_step(
mtr_commit(&mtr);
} else {
/* Create in the system tablespace: disallow new features */
- table->flags &= DICT_TF_COMPACT;
+ table->flags &= (~0 << DICT_TF_BITS) | DICT_TF_COMPACT;
}
row = dict_create_sys_tables_tuple(table, node->heap);
@@ -322,9 +329,10 @@ static
dtuple_t*
dict_create_sys_indexes_tuple(
/*==========================*/
- dict_index_t* index, /*!< in: index */
- mem_heap_t* heap) /*!< in: memory heap from which the memory for
- the built tuple is allocated */
+ const dict_index_t* index, /*!< in: index */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
{
dict_table_t* sys_indexes;
dict_table_t* table;
@@ -333,7 +341,8 @@ dict_create_sys_indexes_tuple(
byte* ptr;
ut_ad(mutex_own(&(dict_sys->mutex)));
- ut_ad(index && heap);
+ ut_ad(index);
+ ut_ad(heap);
sys_indexes = dict_sys->sys_indexes;
@@ -344,32 +353,32 @@ dict_create_sys_indexes_tuple(
dict_table_copy_types(entry, sys_indexes);
/* 0: TABLE_ID -----------------------*/
- dfield = dtuple_get_nth_field(entry, 0);
+ dfield = dtuple_get_nth_field(entry, 0/*TABLE_ID*/);
ptr = mem_heap_alloc(heap, 8);
mach_write_to_8(ptr, table->id);
dfield_set_data(dfield, ptr, 8);
/* 1: ID ----------------------------*/
- dfield = dtuple_get_nth_field(entry, 1);
+ dfield = dtuple_get_nth_field(entry, 1/*ID*/);
ptr = mem_heap_alloc(heap, 8);
mach_write_to_8(ptr, index->id);
dfield_set_data(dfield, ptr, 8);
/* 4: NAME --------------------------*/
- dfield = dtuple_get_nth_field(entry, 2);
+ dfield = dtuple_get_nth_field(entry, 2/*NAME*/);
dfield_set_data(dfield, index->name, ut_strlen(index->name));
/* 5: N_FIELDS ----------------------*/
- dfield = dtuple_get_nth_field(entry, 3);
+ dfield = dtuple_get_nth_field(entry, 3/*N_FIELDS*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, index->n_fields);
dfield_set_data(dfield, ptr, 4);
/* 6: TYPE --------------------------*/
- dfield = dtuple_get_nth_field(entry, 4);
+ dfield = dtuple_get_nth_field(entry, 4/*TYPE*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, index->type);
@@ -381,7 +390,7 @@ dict_create_sys_indexes_tuple(
#error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 7"
#endif
- dfield = dtuple_get_nth_field(entry, 5);
+ dfield = dtuple_get_nth_field(entry, 5/*SPACE*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, index->space);
@@ -393,7 +402,7 @@ dict_create_sys_indexes_tuple(
#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 8"
#endif
- dfield = dtuple_get_nth_field(entry, 6);
+ dfield = dtuple_get_nth_field(entry, 6/*PAGE_NO*/);
ptr = mem_heap_alloc(heap, 4);
mach_write_to_4(ptr, FIL_NULL);
@@ -412,10 +421,11 @@ static
dtuple_t*
dict_create_sys_fields_tuple(
/*=========================*/
- dict_index_t* index, /*!< in: index */
- ulint i, /*!< in: field number */
- mem_heap_t* heap) /*!< in: memory heap from which the memory for
- the built tuple is allocated */
+ const dict_index_t* index, /*!< in: index */
+ ulint i, /*!< in: field number */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
{
dict_table_t* sys_fields;
dtuple_t* entry;
@@ -425,7 +435,8 @@ dict_create_sys_fields_tuple(
ibool index_contains_column_prefix_field = FALSE;
ulint j;
- ut_ad(index && heap);
+ ut_ad(index);
+ ut_ad(heap);
for (j = 0; j < index->n_fields; j++) {
if (dict_index_get_nth_field(index, j)->prefix_len > 0) {
@@ -443,7 +454,7 @@ dict_create_sys_fields_tuple(
dict_table_copy_types(entry, sys_fields);
/* 0: INDEX_ID -----------------------*/
- dfield = dtuple_get_nth_field(entry, 0);
+ dfield = dtuple_get_nth_field(entry, 0/*INDEX_ID*/);
ptr = mem_heap_alloc(heap, 8);
mach_write_to_8(ptr, index->id);
@@ -451,7 +462,7 @@ dict_create_sys_fields_tuple(
dfield_set_data(dfield, ptr, 8);
/* 1: POS + PREFIX LENGTH ----------------------------*/
- dfield = dtuple_get_nth_field(entry, 1);
+ dfield = dtuple_get_nth_field(entry, 1/*POS*/);
ptr = mem_heap_alloc(heap, 4);
@@ -471,7 +482,7 @@ dict_create_sys_fields_tuple(
dfield_set_data(dfield, ptr, 4);
/* 4: COL_NAME -------------------------*/
- dfield = dtuple_get_nth_field(entry, 2);
+ dfield = dtuple_get_nth_field(entry, 2/*COL_NAME*/);
dfield_set_data(dfield, field->name,
ut_strlen(field->name));
@@ -602,6 +613,7 @@ dict_create_index_tree_step(
dict_table_t* sys_indexes;
dict_table_t* table;
dtuple_t* search_tuple;
+ ulint zip_size;
btr_pcur_t pcur;
mtr_t mtr;
@@ -626,8 +638,9 @@ dict_create_index_tree_step(
btr_pcur_move_to_next_user_rec(&pcur, &mtr);
- node->page_no = btr_create(index->type, index->space,
- dict_table_zip_size(index->table),
+ zip_size = dict_table_zip_size(index->table);
+
+ node->page_no = btr_create(index->type, index->space, zip_size,
index->id, index, &mtr);
/* printf("Created a new index tree in space %lu root page %lu\n",
index->space, index->page_no); */
diff --git a/storage/innobase/dict/dict0dict.c b/storage/innobase/dict/dict0dict.c
index 2e524a5a2e3..a58ca2e7802 100644
--- a/storage/innobase/dict/dict0dict.c
+++ b/storage/innobase/dict/dict0dict.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -70,6 +70,17 @@ we need this; NOTE: a transaction which reserves this must keep book
on the mode in trx_struct::dict_operation_lock_mode */
UNIV_INTERN rw_lock_t dict_operation_lock;
+/* Keys to register rwlocks and mutexes with performance schema */
+#ifdef UNIV_PFS_RWLOCK
+UNIV_INTERN mysql_pfs_key_t dict_operation_lock_key;
+UNIV_INTERN mysql_pfs_key_t index_tree_rw_lock_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t dict_sys_mutex_key;
+UNIV_INTERN mysql_pfs_key_t dict_foreign_err_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when
creating a table or index object */
#define DICT_POOL_PER_TABLE_HASH 512 /*!< buffer pool max size per table
@@ -140,7 +151,7 @@ static
void
dict_field_print_low(
/*=================*/
- dict_field_t* field); /*!< in: field */
+ const dict_field_t* field); /*!< in: field */
/*********************************************************************//**
Frees a foreign key struct. */
static
@@ -607,7 +618,7 @@ dict_init(void)
{
dict_sys = mem_alloc(sizeof(dict_sys_t));
- mutex_create(&dict_sys->mutex, SYNC_DICT);
+ mutex_create(dict_sys_mutex_key, &dict_sys->mutex, SYNC_DICT);
dict_sys->table_hash = hash_create(buf_pool_get_curr_size()
/ (DICT_POOL_PER_TABLE_HASH
@@ -619,12 +630,14 @@ dict_init(void)
UT_LIST_INIT(dict_sys->table_LRU);
- rw_lock_create(&dict_operation_lock, SYNC_DICT_OPERATION);
+ rw_lock_create(dict_operation_lock_key,
+ &dict_operation_lock, SYNC_DICT_OPERATION);
dict_foreign_err_file = os_file_create_tmpfile();
ut_a(dict_foreign_err_file);
- mutex_create(&dict_foreign_err_mutex, SYNC_ANY_LATCH);
+ mutex_create(dict_foreign_err_mutex_key,
+ &dict_foreign_err_mutex, SYNC_ANY_LATCH);
}
/**********************************************************************//**
@@ -1460,6 +1473,7 @@ dict_index_add_to_cache(
if (!dict_index_find_cols(table, index)) {
+ dict_mem_index_free(index);
return(DB_CORRUPTION);
}
@@ -1566,7 +1580,8 @@ undo_size_ok:
new_index->stat_n_leaf_pages = 1;
new_index->page = page_no;
- rw_lock_create(&new_index->lock, SYNC_INDEX_TREE);
+ rw_lock_create(index_tree_rw_lock_key, &new_index->lock,
+ SYNC_INDEX_TREE);
if (!UNIV_UNLIKELY(new_index->type & DICT_UNIVERSAL)) {
@@ -4402,7 +4417,7 @@ static
void
dict_field_print_low(
/*=================*/
- dict_field_t* field) /*!< in: field */
+ const dict_field_t* field) /*!< in: field */
{
ut_ad(mutex_own(&(dict_sys->mutex)));
@@ -4766,8 +4781,10 @@ UNIV_INTERN
void
dict_table_check_for_dup_indexes(
/*=============================*/
- const dict_table_t* table) /*!< in: Check for dup indexes
+ const dict_table_t* table, /*!< in: Check for dup indexes
in this table */
+ ibool tmp_ok) /*!< in: TRUE=allow temporary
+ index names */
{
/* Check for duplicates, ignoring indexes that are marked
as to be dropped */
@@ -4775,13 +4792,17 @@ dict_table_check_for_dup_indexes(
const dict_index_t* index1;
const dict_index_t* index2;
+ ut_ad(mutex_own(&dict_sys->mutex));
+
/* The primary index _must_ exist */
ut_a(UT_LIST_GET_LEN(table->indexes) > 0);
index1 = UT_LIST_GET_FIRST(table->indexes);
- index2 = UT_LIST_GET_NEXT(indexes, index1);
- while (index1 && index2) {
+ do {
+ ut_ad(tmp_ok || *index1->name != TEMP_INDEX_PREFIX);
+
+ index2 = UT_LIST_GET_NEXT(indexes, index1);
while (index2) {
@@ -4793,8 +4814,7 @@ dict_table_check_for_dup_indexes(
}
index1 = UT_LIST_GET_NEXT(indexes, index1);
- index2 = UT_LIST_GET_NEXT(indexes, index1);
- }
+ } while (index1);
}
#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/dict/dict0load.c b/storage/innobase/dict/dict0load.c
index 842a129c1a6..377818308c5 100644
--- a/storage/innobase/dict/dict0load.c
+++ b/storage/innobase/dict/dict0load.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -260,7 +260,7 @@ dict_sys_tables_get_flags(
return(0);
}
- field = rec_get_nth_field_old(rec, 4, &len);
+ field = rec_get_nth_field_old(rec, 4/*N_COLS*/, &len);
n_cols = mach_read_from_4(field);
if (UNIV_UNLIKELY(!(n_cols & 0x80000000UL))) {
@@ -390,15 +390,35 @@ loop:
mtr_commit(&mtr);
- if (space_id != 0 && in_crash_recovery) {
+ if (space_id == 0) {
+ /* The system tablespace always exists. */
+ } else if (in_crash_recovery) {
/* Check that the tablespace (the .ibd file) really
- exists; print a warning to the .err log if not */
-
- fil_space_for_table_exists_in_mem(space_id, name,
- FALSE, TRUE, TRUE);
- }
+ exists; print a warning to the .err log if not.
+ Do not print warnings for temporary tables. */
+ ibool is_temp;
+
+ field = rec_get_nth_field_old(rec, 4, &len);
+ if (0x80000000UL & mach_read_from_4(field)) {
+ /* ROW_FORMAT=COMPACT: read the is_temp
+ flag from SYS_TABLES.MIX_LEN. */
+ field = rec_get_nth_field_old(rec, 7, &len);
+ is_temp = mach_read_from_4(field)
+ & DICT_TF2_TEMPORARY;
+ } else {
+ /* For tables created with old versions
+ of InnoDB, SYS_TABLES.MIX_LEN may contain
+ garbage. Such tables would always be
+ in ROW_FORMAT=REDUNDANT. Pretend that
+ all such tables are non-temporary. That is,
+ do not suppress error printouts about
+ temporary tables not being found. */
+ is_temp = FALSE;
+ }
- if (space_id != 0 && !in_crash_recovery) {
+ fil_space_for_table_exists_in_mem(
+ space_id, name, is_temp, TRUE, !is_temp);
+ } else {
/* It is a normal database startup: create the space
object and check that the .ibd file exists. */
@@ -894,43 +914,72 @@ err_exit:
(ulong) flags);
goto err_exit;
}
+ } else {
+ flags = 0;
+ }
- if (fil_space_for_table_exists_in_mem(space, name, FALSE,
- FALSE, FALSE)) {
- /* Ok; (if we did a crash recovery then the tablespace
- can already be in the memory cache) */
- } else {
- /* In >= 4.1.9, InnoDB scans the data dictionary also
- at a normal mysqld startup. It is an error if the
- space object does not exist in memory. */
+ ut_a(name_of_col_is(sys_tables, sys_index, 4, "N_COLS"));
+ field = rec_get_nth_field_old(rec, 4, &len);
+ n_cols = mach_read_from_4(field);
+
+ /* The high-order bit of N_COLS is the "compact format" flag.
+ For tables in that format, MIX_LEN may hold additional flags. */
+ if (n_cols & 0x80000000UL) {
+ ulint flags2;
+
+ flags |= DICT_TF_COMPACT;
+
+ ut_a(name_of_col_is(sys_tables, sys_index, 7, "MIX_LEN"));
+ field = rec_get_nth_field_old(rec, 7, &len);
+
+ flags2 = mach_read_from_4(field);
+
+ if (flags2 & (~0 << (DICT_TF2_BITS - DICT_TF2_SHIFT))) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: table ", stderr);
+ ut_print_filename(stderr, name);
+ fprintf(stderr, "\n"
+ "InnoDB: in InnoDB data dictionary"
+ " has unknown flags %lx.\n",
+ (ulong) flags2);
+
+ flags2 &= ~(~0 << (DICT_TF2_BITS - DICT_TF2_SHIFT));
+ }
+
+ flags |= flags2 << DICT_TF2_SHIFT;
+ }
+
+ /* See if the tablespace is available. */
+ if (space == 0) {
+ /* The system tablespace is always available. */
+ } else if (!fil_space_for_table_exists_in_mem(
+ space, name,
+ (flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY,
+ FALSE, FALSE)) {
+
+ if ((flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY) {
+ /* Do not bother to retry opening temporary tables. */
+ ibd_file_missing = TRUE;
+ } else {
ut_print_timestamp(stderr);
fprintf(stderr,
- " InnoDB: error: space object of table %s,\n"
+ " InnoDB: error: space object of table");
+ ut_print_filename(stderr, name);
+ fprintf(stderr, ",\n"
"InnoDB: space id %lu did not exist in memory."
" Retrying an open.\n",
- name, (ulong)space);
+ (ulong) space);
/* Try to open the tablespace */
if (!fil_open_single_table_tablespace(
- TRUE, space, flags, name)) {
- /* We failed to find a sensible tablespace
- file */
+ TRUE, space,
+ flags & ~(~0 << DICT_TF_BITS), name)) {
+ /* We failed to find a sensible
+ tablespace file */
ibd_file_missing = TRUE;
}
}
- } else {
- flags = 0;
- }
-
- ut_a(name_of_col_is(sys_tables, sys_index, 4, "N_COLS"));
-
- field = rec_get_nth_field_old(rec, 4, &len);
- n_cols = mach_read_from_4(field);
-
- /* The high-order bit of N_COLS is the "compact format" flag. */
- if (n_cols & 0x80000000UL) {
- flags |= DICT_TF_COMPACT;
}
table = dict_mem_table_create(name, space, n_cols & ~0x80000000UL,
diff --git a/storage/innobase/dict/dict0mem.c b/storage/innobase/dict/dict0mem.c
index 6458cbab92d..b2f58fbc63f 100644
--- a/storage/innobase/dict/dict0mem.c
+++ b/storage/innobase/dict/dict0mem.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -40,6 +40,11 @@ Created 1/8/1996 Heikki Tuuri
#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when
creating a table or index object */
+#ifdef UNIV_PFS_MUTEX
+/* Key to register autoinc_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t autoinc_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
/**********************************************************************//**
Creates a table memory object.
@return own: table object */
@@ -59,7 +64,7 @@ dict_mem_table_create(
mem_heap_t* heap;
ut_ad(name);
- ut_a(!(flags & (~0 << DICT_TF_BITS)));
+ ut_a(!(flags & (~0 << DICT_TF2_BITS)));
heap = mem_heap_create(DICT_HEAP_SIZE);
@@ -78,7 +83,8 @@ dict_mem_table_create(
#ifndef UNIV_HOTBACKUP
table->autoinc_lock = mem_heap_alloc(heap, lock_get_size());
- mutex_create(&table->autoinc_mutex, SYNC_DICT_AUTOINC_MUTEX);
+ mutex_create(autoinc_mutex_key,
+ &table->autoinc_mutex, SYNC_DICT_AUTOINC_MUTEX);
table->autoinc = 0;
diff --git a/storage/innobase/fil/fil0fil.c b/storage/innobase/fil/fil0fil.c
index 112a0e27d50..9064710d062 100644
--- a/storage/innobase/fil/fil0fil.c
+++ b/storage/innobase/fil/fil0fil.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -38,6 +38,7 @@ Created 10/25/1995 Heikki Tuuri
#include "mtr0mtr.h"
#include "mtr0log.h"
#include "dict0dict.h"
+#include "page0page.h"
#include "page0zip.h"
#ifndef UNIV_HOTBACKUP
# include "buf0lru.h"
@@ -120,6 +121,16 @@ UNIV_INTERN ulint fil_n_pending_tablespace_flushes = 0;
/** The null file address */
UNIV_INTERN fil_addr_t fil_addr_null = {FIL_NULL, 0};
+#ifdef UNIV_PFS_MUTEX
+/* Key to register fil_system_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t fil_system_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#ifdef UNIV_PFS_RWLOCK
+/* Key to register file space latch with performance schema */
+UNIV_INTERN mysql_pfs_key_t fil_space_latch_key;
+#endif /* UNIV_PFS_RWLOCK */
+
/** File node of a tablespace or the log data space */
struct fil_node_struct {
fil_space_t* space; /*!< backpointer to the space where this node
@@ -648,7 +659,8 @@ fil_node_open_file(
async I/O! */
node->handle = os_file_create_simple_no_error_handling(
- node->name, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
+ innodb_file_data_key, node->name, OS_FILE_OPEN,
+ OS_FILE_READ_ONLY, &success);
if (!success) {
/* The following call prints an error message */
os_file_get_last_error(TRUE);
@@ -766,15 +778,21 @@ add_size:
os_file_create() to fall back to the normal file I/O mode. */
if (space->purpose == FIL_LOG) {
- node->handle = os_file_create(node->name, OS_FILE_OPEN,
- OS_FILE_AIO, OS_LOG_FILE, &ret);
+ node->handle = os_file_create(innodb_file_log_key,
+ node->name, OS_FILE_OPEN,
+ OS_FILE_AIO, OS_LOG_FILE,
+ &ret);
} else if (node->is_raw_disk) {
- node->handle = os_file_create(node->name,
+ node->handle = os_file_create(innodb_file_data_key,
+ node->name,
OS_FILE_OPEN_RAW,
- OS_FILE_AIO, OS_DATA_FILE, &ret);
+ OS_FILE_AIO, OS_DATA_FILE,
+ &ret);
} else {
- node->handle = os_file_create(node->name, OS_FILE_OPEN,
- OS_FILE_AIO, OS_DATA_FILE, &ret);
+ node->handle = os_file_create(innodb_file_data_key,
+ node->name, OS_FILE_OPEN,
+ OS_FILE_AIO, OS_DATA_FILE,
+ &ret);
}
ut_a(ret);
@@ -1097,10 +1115,13 @@ fil_space_create(
fil_space_t* space;
/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
- ROW_FORMAT=COMPACT (table->flags == DICT_TF_COMPACT) and
+ ROW_FORMAT=COMPACT
+ ((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
ROW_FORMAT=REDUNDANT (table->flags == 0). For any other
- format, the tablespace flags should equal table->flags. */
+ format, the tablespace flags should equal
+ (table->flags & ~(~0 << DICT_TF_BITS)). */
ut_a(flags != DICT_TF_COMPACT);
+ ut_a(!(flags & (~0UL << DICT_TF_BITS)));
try_again:
/*printf(
@@ -1208,7 +1229,7 @@ try_again:
UT_LIST_INIT(space->chain);
space->magic_n = FIL_SPACE_MAGIC_N;
- rw_lock_create(&space->latch, SYNC_FSP);
+ rw_lock_create(fil_space_latch_key, &space->latch, SYNC_FSP);
HASH_INSERT(fil_space_t, hash, fil_system->spaces, id, space);
@@ -1510,7 +1531,8 @@ fil_init(
fil_system = mem_alloc(sizeof(fil_system_t));
- mutex_create(&fil_system->mutex, SYNC_ANY_LATCH);
+ mutex_create(fil_system_mutex_key,
+ &fil_system->mutex, SYNC_ANY_LATCH);
fil_system->spaces = hash_create(hash_size);
fil_system->name_hash = hash_create(hash_size);
@@ -2515,7 +2537,7 @@ retry:
success = fil_rename_tablespace_in_mem(space, node, path);
if (success) {
- success = os_file_rename(old_path, path);
+ success = os_file_rename(innodb_file_data_key, old_path, path);
if (!success) {
/* We have to revert the changes we made
@@ -2582,14 +2604,18 @@ fil_create_new_single_table_tablespace(
ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE);
/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
- ROW_FORMAT=COMPACT (table->flags == DICT_TF_COMPACT) and
+ ROW_FORMAT=COMPACT
+ ((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
ROW_FORMAT=REDUNDANT (table->flags == 0). For any other
- format, the tablespace flags should equal table->flags. */
+ format, the tablespace flags should equal
+ (table->flags & ~(~0 << DICT_TF_BITS)). */
ut_a(flags != DICT_TF_COMPACT);
+ ut_a(!(flags & (~0UL << DICT_TF_BITS)));
path = fil_make_ibd_name(tablename, is_temp);
- file = os_file_create(path, OS_FILE_CREATE, OS_FILE_NORMAL,
+ file = os_file_create(innodb_file_data_key, path,
+ OS_FILE_CREATE, OS_FILE_NORMAL,
OS_DATA_FILE, &ret);
if (ret == FALSE) {
ut_print_timestamp(stderr);
@@ -2786,11 +2812,13 @@ fil_reset_too_high_lsns(
ib_int64_t offset;
ulint zip_size;
ibool success;
+ page_zip_des_t page_zip;
filepath = fil_make_ibd_name(name, FALSE);
file = os_file_create_simple_no_error_handling(
- filepath, OS_FILE_OPEN, OS_FILE_READ_WRITE, &success);
+ innodb_file_data_key, filepath, OS_FILE_OPEN,
+ OS_FILE_READ_WRITE, &success);
if (!success) {
/* The following call prints an error message */
os_file_get_last_error(TRUE);
@@ -2833,6 +2861,12 @@ fil_reset_too_high_lsns(
space_id = fsp_header_get_space_id(page);
zip_size = fsp_header_get_zip_size(page);
+ page_zip_des_init(&page_zip);
+ page_zip_set_size(&page_zip, zip_size);
+ if (zip_size) {
+ page_zip.data = page + UNIV_PAGE_SIZE;
+ }
+
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Flush lsn in the tablespace file %lu"
@@ -2867,20 +2901,23 @@ fil_reset_too_high_lsns(
/* We have to reset the lsn */
if (zip_size) {
- memcpy(page + UNIV_PAGE_SIZE, page, zip_size);
+ memcpy(page_zip.data, page, zip_size);
buf_flush_init_for_writing(
- page, page + UNIV_PAGE_SIZE,
- current_lsn);
+ page, &page_zip, current_lsn);
+ success = os_file_write(
+ filepath, file, page_zip.data,
+ (ulint) offset & 0xFFFFFFFFUL,
+ (ulint) (offset >> 32), zip_size);
} else {
buf_flush_init_for_writing(
page, NULL, current_lsn);
+ success = os_file_write(
+ filepath, file, page,
+ (ulint)(offset & 0xFFFFFFFFUL),
+ (ulint)(offset >> 32),
+ UNIV_PAGE_SIZE);
}
- success = os_file_write(filepath, file, page,
- (ulint)(offset & 0xFFFFFFFFUL),
- (ulint)(offset >> 32),
- zip_size
- ? zip_size
- : UNIV_PAGE_SIZE);
+
if (!success) {
goto func_exit;
@@ -2956,13 +2993,17 @@ fil_open_single_table_tablespace(
filepath = fil_make_ibd_name(name, FALSE);
/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
- ROW_FORMAT=COMPACT (table->flags == DICT_TF_COMPACT) and
+ ROW_FORMAT=COMPACT
+ ((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
ROW_FORMAT=REDUNDANT (table->flags == 0). For any other
- format, the tablespace flags should equal table->flags. */
+ format, the tablespace flags should equal
+ (table->flags & ~(~0 << DICT_TF_BITS)). */
ut_a(flags != DICT_TF_COMPACT);
+ ut_a(!(flags & (~0UL << DICT_TF_BITS)));
file = os_file_create_simple_no_error_handling(
- filepath, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
+ innodb_file_data_key, filepath, OS_FILE_OPEN,
+ OS_FILE_READ_ONLY, &success);
if (!success) {
/* The following call prints an error message */
os_file_get_last_error(TRUE);
@@ -3011,7 +3052,8 @@ fil_open_single_table_tablespace(
ut_free(buf2);
- if (UNIV_UNLIKELY(space_id != id || space_flags != flags)) {
+ if (UNIV_UNLIKELY(space_id != id
+ || space_flags != (flags & ~(~0 << DICT_TF_BITS)))) {
ut_print_timestamp(stderr);
fputs(" InnoDB: Error: tablespace id and flags in file ",
@@ -3117,7 +3159,8 @@ fil_load_single_table_tablespace(
# endif /* !UNIV_HOTBACKUP */
#endif
file = os_file_create_simple_no_error_handling(
- filepath, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
+ innodb_file_data_key, filepath, OS_FILE_OPEN,
+ OS_FILE_READ_ONLY, &success);
if (!success) {
/* The following call prints an error message */
os_file_get_last_error(TRUE);
@@ -3275,7 +3318,7 @@ fil_load_single_table_tablespace(
os_file_close(file);
new_path = fil_make_ibbackup_old_name(filepath);
- ut_a(os_file_rename(filepath, new_path));
+ ut_a(os_file_rename(innodb_file_data_key, filepath, new_path));
ut_free(buf2);
mem_free(filepath);
@@ -3313,7 +3356,7 @@ fil_load_single_table_tablespace(
mutex_exit(&fil_system->mutex);
- ut_a(os_file_rename(filepath, new_path));
+ ut_a(os_file_rename(innodb_file_data_key, filepath, new_path));
ut_free(buf2);
mem_free(filepath);
@@ -4435,11 +4478,14 @@ fil_aio_wait(
ut_ad(fil_validate());
- if (os_aio_use_native_aio) {
+ if (srv_use_native_aio) {
srv_set_io_thread_op_info(segment, "native aio handle");
#ifdef WIN_ASYNC_IO
ret = os_aio_windows_handle(segment, 0, &fil_node,
&message, &type);
+#elif defined(LINUX_NATIVE_AIO)
+ ret = os_aio_linux_handle(segment, &fil_node,
+ &message, &type);
#else
ret = 0; /* Eliminate compiler warning */
ut_error;
@@ -4781,8 +4827,10 @@ void
fil_close(void)
/*===========*/
{
+#ifndef UNIV_HOTBACKUP
/* The mutex should already have been freed. */
ut_ad(fil_system->mutex.magic_n == 0);
+#endif /* !UNIV_HOTBACKUP */
hash_table_free(fil_system->spaces);
diff --git a/storage/innobase/fsp/fsp0fsp.c b/storage/innobase/fsp/fsp0fsp.c
index 3cc4318fc06..c7f1a299d8a 100644
--- a/storage/innobase/fsp/fsp0fsp.c
+++ b/storage/innobase/fsp/fsp0fsp.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -386,11 +386,11 @@ UNIV_INLINE
ibool
xdes_get_bit(
/*=========*/
- xdes_t* descr, /*!< in: descriptor */
- ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
- ulint offset, /*!< in: page offset within extent:
- 0 ... FSP_EXTENT_SIZE - 1 */
- mtr_t* mtr) /*!< in: mtr */
+ const xdes_t* descr, /*!< in: descriptor */
+ ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+ ulint offset, /*!< in: page offset within extent:
+ 0 ... FSP_EXTENT_SIZE - 1 */
+ mtr_t* mtr) /*!< in: mtr */
{
ulint index;
ulint byte_index;
@@ -527,8 +527,8 @@ UNIV_INLINE
ulint
xdes_get_n_used(
/*============*/
- xdes_t* descr, /*!< in: descriptor */
- mtr_t* mtr) /*!< in: mtr */
+ const xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in: mtr */
{
ulint i;
ulint count = 0;
@@ -551,8 +551,8 @@ UNIV_INLINE
ibool
xdes_is_free(
/*=========*/
- xdes_t* descr, /*!< in: descriptor */
- mtr_t* mtr) /*!< in: mtr */
+ const xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in: mtr */
{
if (0 == xdes_get_n_used(descr, mtr)) {
@@ -569,8 +569,8 @@ UNIV_INLINE
ibool
xdes_is_full(
/*=========*/
- xdes_t* descr, /*!< in: descriptor */
- mtr_t* mtr) /*!< in: mtr */
+ const xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in: mtr */
{
if (FSP_EXTENT_SIZE == xdes_get_n_used(descr, mtr)) {
@@ -586,7 +586,7 @@ UNIV_INLINE
void
xdes_set_state(
/*===========*/
- xdes_t* descr, /*!< in: descriptor */
+ xdes_t* descr, /*!< in/out: descriptor */
ulint state, /*!< in: state to set */
mtr_t* mtr) /*!< in: mtr handle */
{
@@ -605,8 +605,8 @@ UNIV_INLINE
ulint
xdes_get_state(
/*===========*/
- xdes_t* descr, /*!< in: descriptor */
- mtr_t* mtr) /*!< in: mtr handle */
+ const xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in: mtr handle */
{
ulint state;
@@ -705,7 +705,7 @@ UNIV_INLINE
xdes_t*
xdes_get_descriptor_with_space_hdr(
/*===============================*/
- fsp_header_t* sp_header,/*!< in: space header, x-latched */
+ fsp_header_t* sp_header,/*!< in/out: space header, x-latched */
ulint space, /*!< in: space id */
ulint offset, /*!< in: page offset;
if equal to the free limit,
@@ -869,9 +869,7 @@ fsp_init_file_page_low(
return;
}
-#ifdef UNIV_BASIC_LOG_DEBUG
- memset(page, 0xff, UNIV_PAGE_SIZE);
-#endif
+ UNIV_MEM_INVALID(page, UNIV_PAGE_SIZE);
mach_write_to_4(page + FIL_PAGE_OFFSET, buf_block_get_page_no(block));
memset(page + FIL_PAGE_LSN, 0, 8);
mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
@@ -1342,7 +1340,7 @@ fsp_fill_free_list(
descriptor page and ibuf bitmap page;
then we do not allocate more extents */
ulint space, /*!< in: space */
- fsp_header_t* header, /*!< in: space header */
+ fsp_header_t* header, /*!< in/out: space header */
mtr_t* mtr) /*!< in: mtr */
{
ulint limit;
diff --git a/storage/innobase/ha/ha0ha.c b/storage/innobase/ha/ha0ha.c
index cb5e541b55d..db85288298d 100644
--- a/storage/innobase/ha/ha0ha.c
+++ b/storage/innobase/ha/ha0ha.c
@@ -101,6 +101,8 @@ ha_clear(
ulint i;
ulint n;
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
#ifdef UNIV_SYNC_DEBUG
ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EXCLUSIVE));
#endif /* UNIV_SYNC_DEBUG */
@@ -146,7 +148,9 @@ ha_insert_for_fold_func(
ha_node_t* prev_node;
ulint hash;
- ut_ad(table && data);
+ ut_ad(data);
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
ut_a(block->frame == page_align(data));
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
@@ -237,6 +241,8 @@ ha_delete_hash_node(
hash_table_t* table, /*!< in: hash table */
ha_node_t* del_node) /*!< in: node to be deleted */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
# ifndef UNIV_HOTBACKUP
if (table->adaptive) {
@@ -267,6 +273,8 @@ ha_search_and_update_if_found_func(
{
ha_node_t* node;
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
ASSERT_HASH_MUTEX_OWN(table, fold);
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
ut_a(new_block->frame == page_align(new_data));
@@ -304,6 +312,8 @@ ha_remove_all_nodes_to_page(
{
ha_node_t* node;
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
ASSERT_HASH_MUTEX_OWN(table, fold);
node = ha_chain_get_first(table, fold);
@@ -353,6 +363,8 @@ ha_validate(
ibool ok = TRUE;
ulint i;
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
ut_a(start_index <= end_index);
ut_a(start_index < hash_get_n_cells(table));
ut_a(end_index < hash_get_n_cells(table));
@@ -391,6 +403,8 @@ ha_print_info(
FILE* file, /*!< in: file where to print */
hash_table_t* table) /*!< in: hash table */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
#ifdef UNIV_DEBUG
/* Some of the code here is disabled for performance reasons in production
builds, see http://bugs.mysql.com/36941 */
diff --git a/storage/innobase/ha/hash0hash.c b/storage/innobase/ha/hash0hash.c
index 2800d7793f8..9589da00454 100644
--- a/storage/innobase/ha/hash0hash.c
+++ b/storage/innobase/ha/hash0hash.c
@@ -31,6 +31,11 @@ Created 5/20/1997 Heikki Tuuri
#include "mem0mem.h"
#ifndef UNIV_HOTBACKUP
+
+# ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t hash_table_mutex_key;
+# endif /* UNIV_PFS_MUTEX */
+
/************************************************************//**
Reserves the mutex for a fold value in a hash table. */
UNIV_INTERN
@@ -119,7 +124,7 @@ hash_create(
table->heaps = NULL;
#endif /* !UNIV_HOTBACKUP */
table->heap = NULL;
- table->magic_n = HASH_TABLE_MAGIC_N;
+ ut_d(table->magic_n = HASH_TABLE_MAGIC_N);
/* Initialize the cell array */
hash_table_clear(table);
@@ -135,6 +140,8 @@ hash_table_free(
/*============*/
hash_table_t* table) /*!< in, own: hash table */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
#ifndef UNIV_HOTBACKUP
ut_a(table->mutexes == NULL);
#endif /* !UNIV_HOTBACKUP */
@@ -160,13 +167,16 @@ hash_create_mutexes_func(
{
ulint i;
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
ut_a(n_mutexes > 0);
ut_a(ut_is_2pow(n_mutexes));
table->mutexes = mem_alloc(n_mutexes * sizeof(mutex_t));
for (i = 0; i < n_mutexes; i++) {
- mutex_create(table->mutexes + i, sync_level);
+ mutex_create(hash_table_mutex_key,
+ table->mutexes + i, sync_level);
}
table->n_mutexes = n_mutexes;
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 3debf2805b6..3944cb09767 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -1,7 +1,8 @@
/*****************************************************************************
-Copyright (c) 2000, 2009, MySQL AB & Innobase Oy. All Rights Reserved.
+Copyright (c) 2000, 2010, MySQL AB & Innobase Oy. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
+Copyright (c) 2009, Percona Inc.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -9,6 +10,13 @@ briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
@@ -22,32 +30,6 @@ this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA
*****************************************************************************/
-/***********************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-Copyright (c) 2009, Percona Inc.
-
-Portions of this file contain modifications contributed and copyrighted
-by Percona Inc.. Those modifications are
-gratefully acknowledged and are described briefly in the InnoDB
-documentation. The contributions by Percona Inc. are incorporated with
-their permission, and subject to the conditions contained in the file
-COPYING.Percona.
-
-This program is free software; you can redistribute it and/or modify it
-under the terms of the GNU General Public License as published by the
-Free Software Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
-Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-***********************************************************************/
/* TODO list for the InnoDB handler in 5.0:
- Remove the flag trx->active_trans and look at trx->conc_state
@@ -61,14 +43,12 @@ with this program; if not, write to the Free Software Foundation, Inc.,
#pragma implementation // gcc: Class implementation
#endif
-#include <sql_table.h> // explain_filename, nz2, EXPLAIN_PARTITIONS_AS_COMMENT,
- // EXPLAIN_FILENAME_MAX_EXTRA_LENGTH
+#include <mysql_priv.h>
-#include <sql_acl.h> // PROCESS_ACL
#include <m_ctype.h>
#include <mysys_err.h>
#include <mysql/plugin.h>
-#include <mysql/innodb_priv.h>
+#include <mysql/psi/psi.h>
/** @file ha_innodb.cc */
@@ -109,6 +89,7 @@ extern "C" {
#include "ha_innodb.h"
#include "i_s.h"
+#ifndef MYSQL_SERVER
# ifndef MYSQL_PLUGIN_IMPORT
# define MYSQL_PLUGIN_IMPORT /* nothing */
# endif /* MYSQL_PLUGIN_IMPORT */
@@ -118,16 +99,17 @@ extern "C" {
but we need it here */
bool check_global_access(THD *thd, ulong want_access);
#endif /* MYSQL_VERSION_ID < 50124 */
+#endif /* MYSQL_SERVER */
/** to protect innobase_open_files */
-static pthread_mutex_t innobase_share_mutex;
+static mysql_mutex_t innobase_share_mutex;
/** to force correct commit order in binlog */
-static pthread_mutex_t prepare_commit_mutex;
+static mysql_mutex_t prepare_commit_mutex;
static ulong commit_threads = 0;
-static pthread_mutex_t commit_threads_m;
-static pthread_cond_t commit_cond;
-static pthread_mutex_t commit_cond_m;
-static pthread_mutex_t analyze_mutex;
+static mysql_mutex_t commit_threads_m;
+static mysql_cond_t commit_cond;
+static mysql_mutex_t commit_cond_m;
+static mysql_mutex_t analyze_mutex;
static bool innodb_inited = 0;
#define INSIDE_HA_INNOBASE_CC
@@ -211,9 +193,138 @@ bool nw_panic = FALSE;
/** Allowed values of innodb_change_buffering */
static const char* innobase_change_buffering_values[IBUF_USE_COUNT] = {
"none", /* IBUF_USE_NONE */
- "inserts" /* IBUF_USE_INSERT */
+ "inserts", /* IBUF_USE_INSERT */
+ "deletes", /* IBUF_USE_DELETE_MARK */
+ "changes", /* IBUF_USE_INSERT_DELETE_MARK */
+ "purges", /* IBUF_USE_DELETE */
+ "all" /* IBUF_USE_ALL */
};
+#ifdef HAVE_PSI_INTERFACE
+/* Keys to register pthread mutexes/cond in the current file with
+performance schema */
+static mysql_pfs_key_t innobase_share_mutex_key;
+static mysql_pfs_key_t prepare_commit_mutex_key;
+static mysql_pfs_key_t commit_threads_m_key;
+static mysql_pfs_key_t analyze_mutex_key;
+static mysql_pfs_key_t commit_cond_mutex_key;
+static mysql_pfs_key_t commit_cond_key;
+
+static PSI_mutex_info all_pthread_mutexes[] = {
+ {&analyze_mutex_key, "analyze_mutex", 0},
+ {&commit_threads_m_key, "commit_threads_m", 0},
+ {&commit_cond_mutex_key, "commit_cond_mutex", 0},
+ {&innobase_share_mutex_key, "innobase_share_mutex", 0},
+ {&prepare_commit_mutex_key, "prepare_commit_mutex", 0}
+};
+
+static PSI_cond_info all_innodb_conds[] = {
+ {&commit_cond_key, "commit_cond", 0}
+};
+
+# ifdef UNIV_PFS_MUTEX
+/* all_innodb_mutexes array contains mutexes that are
+performance schema instrumented if "UNIV_PFS_MUTEX"
+is defined */
+static PSI_mutex_info all_innodb_mutexes[] = {
+ {&autoinc_mutex_key, "autoinc_mutex", 0},
+ {&btr_search_enabled_mutex_key, "btr_search_enabled_mutex", 0},
+# ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK
+ {&buffer_block_mutex_key, "buffer_block_mutex", 0},
+# endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
+ {&buf_pool_mutex_key, "buf_pool_mutex", 0},
+ {&buf_pool_zip_mutex_key, "buf_pool_zip_mutex", 0},
+ {&cache_last_read_mutex_key, "cache_last_read_mutex", 0},
+ {&dict_foreign_err_mutex_key, "dict_foreign_err_mutex", 0},
+ {&dict_sys_mutex_key, "dict_sys_mutex", 0},
+ {&file_format_max_mutex_key, "file_format_max_mutex", 0},
+ {&fil_system_mutex_key, "fil_system_mutex", 0},
+ {&flush_list_mutex_key, "flush_list_mutex", 0},
+ {&flush_order_mutex_key, "flush_order_mutex", 0},
+ {&hash_table_mutex_key, "hash_table_mutex", 0},
+ {&ibuf_bitmap_mutex_key, "ibuf_bitmap_mutex", 0},
+ {&ibuf_mutex_key, "ibuf_mutex", 0},
+ {&ibuf_pessimistic_insert_mutex_key,
+ "ibuf_pessimistic_insert_mutex", 0},
+ {&ios_mutex_key, "ios_mutex", 0},
+ {&kernel_mutex_key, "kernel_mutex", 0},
+ {&log_sys_mutex_key, "log_sys_mutex", 0},
+# ifdef UNIV_MEM_DEBUG
+ {&mem_hash_mutex_key, "mem_hash_mutex", 0},
+# endif /* UNIV_MEM_DEBUG */
+ {&mem_pool_mutex_key, "mem_pool_mutex", 0},
+ {&mutex_list_mutex_key, "mutex_list_mutex", 0},
+ {&purge_sys_mutex_key, "purge_sys_mutex", 0},
+ {&recv_sys_mutex_key, "recv_sys_mutex", 0},
+ {&rseg_mutex_key, "rseg_mutex", 0},
+# ifdef UNIV_SYNC_DEBUG
+ {&rw_lock_debug_mutex_key, "rw_lock_debug_mutex", 0},
+# endif /* UNIV_SYNC_DEBUG */
+ {&rw_lock_list_mutex_key, "rw_lock_list_mutex", 0},
+ {&rw_lock_mutex_key, "rw_lock_mutex", 0},
+ {&srv_dict_tmpfile_mutex_key, "srv_dict_tmpfile_mutex", 0},
+ {&srv_innodb_monitor_mutex_key, "srv_innodb_monitor_mutex", 0},
+ {&srv_misc_tmpfile_mutex_key, "srv_misc_tmpfile_mutex", 0},
+ {&srv_monitor_file_mutex_key, "srv_monitor_file_mutex", 0},
+ {&syn_arr_mutex_key, "syn_arr_mutex", 0},
+# ifdef UNIV_SYNC_DEBUG
+ {&sync_thread_mutex_key, "sync_thread_mutex", 0},
+# endif /* UNIV_SYNC_DEBUG */
+ {&trx_doublewrite_mutex_key, "trx_doublewrite_mutex", 0},
+ {&thr_local_mutex_key, "thr_local_mutex", 0},
+ {&trx_undo_mutex_key, "trx_undo_mutex", 0}
+};
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+/* all_innodb_rwlocks array contains rwlocks that are
+performance schema instrumented if "UNIV_PFS_RWLOCK"
+is defined */
+static PSI_rwlock_info all_innodb_rwlocks[] = {
+# ifdef UNIV_LOG_ARCHIVE
+ {&archive_lock_key, "archive_lock", 0},
+# endif /* UNIV_LOG_ARCHIVE */
+ {&btr_search_latch_key, "btr_search_latch", 0},
+# ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK
+ {&buf_block_lock_key, "buf_block_lock", 0},
+# endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
+# ifdef UNIV_SYNC_DEBUG
+ {&buf_block_debug_latch_key, "buf_block_debug_latch", 0},
+# endif /* UNIV_SYNC_DEBUG */
+ {&dict_operation_lock_key, "dict_operation_lock", 0},
+ {&fil_space_latch_key, "fil_space_latch", 0},
+ {&checkpoint_lock_key, "checkpoint_lock", 0},
+ {&trx_i_s_cache_lock_key, "trx_i_s_cache_lock", 0},
+ {&trx_purge_latch_key, "trx_purge_latch", 0},
+ {&index_tree_rw_lock_key, "index_tree_rw_lock", 0}
+};
+# endif /* UNIV_PFS_RWLOCK */
+
+# ifdef UNIV_PFS_THREAD
+/* all_innodb_threads array contains threads that are
+performance schema instrumented if "UNIV_PFS_THREAD"
+is defined */
+static PSI_thread_info all_innodb_threads[] = {
+ {&trx_rollback_clean_thread_key, "trx_rollback_clean_thread", 0},
+ {&io_handler_thread_key, "io_handler_thread", 0},
+ {&srv_lock_timeout_thread_key, "srv_lock_timeout_thread", 0},
+ {&srv_error_monitor_thread_key, "srv_error_monitor_thread", 0},
+ {&srv_monitor_thread_key, "srv_monitor_thread", 0},
+ {&srv_master_thread_key, "srv_master_thread", 0}
+};
+# endif /* UNIV_PFS_THREAD */
+
+# ifdef UNIV_PFS_IO
+/* all_innodb_files array contains the type of files that are
+performance schema instrumented if "UNIV_PFS_IO" is defined */
+static PSI_file_info all_innodb_files[] = {
+ {&innodb_file_data_key, "innodb_data_file", 0},
+ {&innodb_file_log_key, "innodb_log_file", 0},
+ {&innodb_file_temp_key, "innodb_temp_file", 0}
+};
+# endif /* UNIV_PFS_IO */
+#endif /* HAVE_PSI_INTERFACE */
+
static INNOBASE_SHARE *get_share(const char *table_name);
static void free_share(INNOBASE_SHARE *share);
static int innobase_close_connection(handlerton *hton, THD* thd);
@@ -335,7 +446,7 @@ static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
static handler *innobase_create_handler(handlerton *hton,
- TABLE_SHARE *table,
+ TABLE_SHARE *table,
MEM_ROOT *mem_root)
{
return new (mem_root) ha_innobase(hton, table);
@@ -448,8 +559,9 @@ static
int
innobase_start_trx_and_assign_read_view(
/*====================================*/
- handlerton* hton, /*!< in: Innodb handlerton */
- THD* thd); /*!< in: MySQL thread handle of the user for whom
+ /* out: 0 */
+ handlerton* hton, /* in: Innodb handlerton */
+ THD* thd); /* in: MySQL thread handle of the user for whom
the transaction should be committed */
/****************************************************************//**
Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes
@@ -2240,6 +2352,45 @@ innobase_change_buffering_inited_ok:
innobase_commit_concurrency_init_default();
+#ifdef HAVE_PSI_INTERFACE
+ /* Register keys with MySQL performance schema */
+ if (PSI_server) {
+ int count;
+
+ count = array_elements(all_pthread_mutexes);
+ PSI_server->register_mutex("innodb",
+ all_pthread_mutexes, count);
+
+# ifdef UNIV_PFS_MUTEX
+ count = array_elements(all_innodb_mutexes);
+ PSI_server->register_mutex("innodb",
+ all_innodb_mutexes, count);
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+ count = array_elements(all_innodb_rwlocks);
+ PSI_server->register_rwlock("innodb",
+ all_innodb_rwlocks, count);
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_THREAD
+ count = array_elements(all_innodb_threads);
+ PSI_server->register_thread("innodb",
+ all_innodb_threads, count);
+# endif /* UNIV_PFS_THREAD */
+
+# ifdef UNIV_PFS_IO
+ count = array_elements(all_innodb_files);
+ PSI_server->register_file("innodb",
+ all_innodb_files, count);
+# endif /* UNIV_PFS_IO */
+
+ count = array_elements(all_innodb_conds);
+ PSI_server->register_cond("innodb",
+ all_innodb_conds, count);
+ }
+#endif /* HAVE_PSI_INTERFACE */
+
/* Since we in this module access directly the fields of a trx
struct, and due to different headers and flags it might happen that
mutex_t has a different size in this module and in InnoDB
@@ -2253,12 +2404,18 @@ innobase_change_buffering_inited_ok:
}
innobase_open_tables = hash_create(200);
- pthread_mutex_init(&innobase_share_mutex, MY_MUTEX_INIT_FAST);
- pthread_mutex_init(&prepare_commit_mutex, MY_MUTEX_INIT_FAST);
- pthread_mutex_init(&commit_threads_m, MY_MUTEX_INIT_FAST);
- pthread_mutex_init(&commit_cond_m, MY_MUTEX_INIT_FAST);
- pthread_mutex_init(&analyze_mutex, MY_MUTEX_INIT_FAST);
- pthread_cond_init(&commit_cond, NULL);
+ mysql_mutex_init(innobase_share_mutex_key,
+ &innobase_share_mutex,
+ MY_MUTEX_INIT_FAST);
+ mysql_mutex_init(prepare_commit_mutex_key,
+ &prepare_commit_mutex, MY_MUTEX_INIT_FAST);
+ mysql_mutex_init(commit_threads_m_key,
+ &commit_threads_m, MY_MUTEX_INIT_FAST);
+ mysql_mutex_init(commit_cond_mutex_key,
+ &commit_cond_m, MY_MUTEX_INIT_FAST);
+ mysql_mutex_init(analyze_mutex_key,
+ &analyze_mutex, MY_MUTEX_INIT_FAST);
+ mysql_cond_init(commit_cond_key, &commit_cond, NULL);
innodb_inited= 1;
#ifdef MYSQL_DYNAMIC_PLUGIN
if (innobase_hton != p) {
@@ -2308,12 +2465,12 @@ innobase_end(
srv_free_paths_and_sizes();
my_free(internal_innobase_data_file_path,
MYF(MY_ALLOW_ZERO_PTR));
- pthread_mutex_destroy(&innobase_share_mutex);
- pthread_mutex_destroy(&prepare_commit_mutex);
- pthread_mutex_destroy(&commit_threads_m);
- pthread_mutex_destroy(&commit_cond_m);
- pthread_mutex_destroy(&analyze_mutex);
- pthread_cond_destroy(&commit_cond);
+ mysql_mutex_destroy(&innobase_share_mutex);
+ mysql_mutex_destroy(&prepare_commit_mutex);
+ mysql_mutex_destroy(&commit_threads_m);
+ mysql_mutex_destroy(&commit_cond_m);
+ mysql_mutex_destroy(&analyze_mutex);
+ mysql_cond_destroy(&commit_cond);
}
DBUG_RETURN(err);
@@ -2478,18 +2635,18 @@ innobase_commit(
prepare_commit_mutex */
retry:
if (innobase_commit_concurrency > 0) {
- pthread_mutex_lock(&commit_cond_m);
+ mysql_mutex_lock(&commit_cond_m);
commit_threads++;
if (commit_threads > innobase_commit_concurrency) {
commit_threads--;
- pthread_cond_wait(&commit_cond,
+ mysql_cond_wait(&commit_cond,
&commit_cond_m);
- pthread_mutex_unlock(&commit_cond_m);
+ mysql_mutex_unlock(&commit_cond_m);
goto retry;
}
else {
- pthread_mutex_unlock(&commit_cond_m);
+ mysql_mutex_unlock(&commit_cond_m);
}
}
@@ -2517,15 +2674,15 @@ retry:
trx->flush_log_later = FALSE;
if (innobase_commit_concurrency > 0) {
- pthread_mutex_lock(&commit_cond_m);
+ mysql_mutex_lock(&commit_cond_m);
commit_threads--;
- pthread_cond_signal(&commit_cond);
- pthread_mutex_unlock(&commit_cond_m);
+ mysql_cond_signal(&commit_cond);
+ mysql_mutex_unlock(&commit_cond_m);
}
if (trx->active_trans == 2) {
- pthread_mutex_unlock(&prepare_commit_mutex);
+ mysql_mutex_unlock(&prepare_commit_mutex);
}
/* Now do a write + flush of logs. */
@@ -3022,59 +3179,370 @@ normalize_table_name(
}
/********************************************************************//**
+Get the upper limit of the MySQL integral and floating-point type.
+@return maximum allowed value for the field */
+static
+ulonglong
+innobase_get_int_col_max_value(
+/*===========================*/
+ const Field* field) /*!< in: MySQL field */
+{
+ ulonglong max_value = 0;
+
+ switch(field->key_type()) {
+ /* TINY */
+ case HA_KEYTYPE_BINARY:
+ max_value = 0xFFULL;
+ break;
+ case HA_KEYTYPE_INT8:
+ max_value = 0x7FULL;
+ break;
+ /* SHORT */
+ case HA_KEYTYPE_USHORT_INT:
+ max_value = 0xFFFFULL;
+ break;
+ case HA_KEYTYPE_SHORT_INT:
+ max_value = 0x7FFFULL;
+ break;
+ /* MEDIUM */
+ case HA_KEYTYPE_UINT24:
+ max_value = 0xFFFFFFULL;
+ break;
+ case HA_KEYTYPE_INT24:
+ max_value = 0x7FFFFFULL;
+ break;
+ /* LONG */
+ case HA_KEYTYPE_ULONG_INT:
+ max_value = 0xFFFFFFFFULL;
+ break;
+ case HA_KEYTYPE_LONG_INT:
+ max_value = 0x7FFFFFFFULL;
+ break;
+ /* BIG */
+ case HA_KEYTYPE_ULONGLONG:
+ max_value = 0xFFFFFFFFFFFFFFFFULL;
+ break;
+ case HA_KEYTYPE_LONGLONG:
+ max_value = 0x7FFFFFFFFFFFFFFFULL;
+ break;
+ case HA_KEYTYPE_FLOAT:
+ /* We use the maximum as per IEEE754-2008 standard, 2^24 */
+ max_value = 0x1000000ULL;
+ break;
+ case HA_KEYTYPE_DOUBLE:
+ /* We use the maximum as per IEEE754-2008 standard, 2^53 */
+ max_value = 0x20000000000000ULL;
+ break;
+ default:
+ ut_error;
+ }
+
+ return(max_value);
+}
+
+/*******************************************************************//**
+This function checks whether the index column information
+is consistent between KEY info from mysql and that from innodb index.
+@return TRUE if all column types match. */
+static
+ibool
+innobase_match_index_columns(
+/*=========================*/
+ const KEY* key_info, /*!< in: Index info
+ from mysql */
+ const dict_index_t* index_info) /*!< in: Index info
+ from Innodb */
+{
+ const KEY_PART_INFO* key_part;
+ const KEY_PART_INFO* key_end;
+ const dict_field_t* innodb_idx_fld;
+ const dict_field_t* innodb_idx_fld_end;
+
+ DBUG_ENTER("innobase_match_index_columns");
+
+ /* Check whether user defined index column count matches */
+ if (key_info->key_parts != index_info->n_user_defined_cols) {
+ DBUG_RETURN(FALSE);
+ }
+
+ key_part = key_info->key_part;
+ key_end = key_part + key_info->key_parts;
+ innodb_idx_fld = index_info->fields;
+ innodb_idx_fld_end = index_info->fields + index_info->n_fields;
+
+ /* Check each index column's datatype. We do not check
+ column name because there exists case that index
+ column name got modified in mysql but such change does not
+ propagate to InnoDB.
+ One hidden assumption here is that the index column sequences
+ are matched up between those in mysql and Innodb. */
+ for (; key_part != key_end; ++key_part) {
+ ulint col_type;
+ ibool is_unsigned;
+ ulint mtype = innodb_idx_fld->col->mtype;
+
+ /* Need to translate to InnoDB column type before
+ comparison. */
+ col_type = get_innobase_type_from_mysql_type(&is_unsigned,
+ key_part->field);
+
+ /* Ignore Innodb specific system columns. */
+ while (mtype == DATA_SYS) {
+ innodb_idx_fld++;
+
+ if (innodb_idx_fld >= innodb_idx_fld_end) {
+ DBUG_RETURN(FALSE);
+ }
+ }
+
+ if (col_type != mtype) {
+ /* Column Type mismatches */
+ DBUG_RETURN(FALSE);
+ }
+
+ innodb_idx_fld++;
+ }
+
+ DBUG_RETURN(TRUE);
+}
+
+/*******************************************************************//**
+This function builds a translation table in INNOBASE_SHARE
+structure for fast index location with mysql array number from its
+table->key_info structure. This also provides the necessary translation
+between the key order in mysql key_info and Innodb ib_table->indexes if
+they are not fully matched with each other.
+Note we do not have any mutex protecting the translation table
+building based on the assumption that there is no concurrent
+index creation/drop and DMLs that requires index lookup. All table
+handle will be closed before the index creation/drop.
+@return TRUE if index translation table built successfully */
+static
+ibool
+innobase_build_index_translation(
+/*=============================*/
+ const TABLE* table, /*!< in: table in MySQL data
+ dictionary */
+ dict_table_t* ib_table, /*!< in: table in Innodb data
+ dictionary */
+ INNOBASE_SHARE* share) /*!< in/out: share structure
+ where index translation table
+ will be constructed in. */
+{
+ ulint mysql_num_index;
+ ulint ib_num_index;
+ dict_index_t** index_mapping;
+ ibool ret = TRUE;
+
+ DBUG_ENTER("innobase_build_index_translation");
+
+ mysql_num_index = table->s->keys;
+ ib_num_index = UT_LIST_GET_LEN(ib_table->indexes);
+
+ index_mapping = share->idx_trans_tbl.index_mapping;
+
+ /* If there exists inconsistency between MySQL and InnoDB dictionary
+ (metadata) information, the number of index defined in MySQL
+ could exceed that in InnoDB, do not build index translation
+ table in such case */
+ if (UNIV_UNLIKELY(ib_num_index < mysql_num_index)) {
+ ret = FALSE;
+ goto func_exit;
+ }
+
+ /* If index entry count is non-zero, nothing has
+ changed since last update, directly return TRUE */
+ if (share->idx_trans_tbl.index_count) {
+ /* Index entry count should still match mysql_num_index */
+ ut_a(share->idx_trans_tbl.index_count == mysql_num_index);
+ goto func_exit;
+ }
+
+ /* The number of index increased, rebuild the mapping table */
+ if (mysql_num_index > share->idx_trans_tbl.array_size) {
+ index_mapping = (dict_index_t**) my_realloc(index_mapping,
+ mysql_num_index *
+ sizeof(*index_mapping),
+ MYF(MY_ALLOW_ZERO_PTR));
+
+ if (!index_mapping) {
+ ret = FALSE;
+ goto func_exit;
+ }
+
+ share->idx_trans_tbl.array_size = mysql_num_index;
+ }
+
+
+ /* For each index in the mysql key_info array, fetch its
+ corresponding InnoDB index pointer into index_mapping
+ array. */
+ for (ulint count = 0; count < mysql_num_index; count++) {
+
+ /* Fetch index pointers into index_mapping according to mysql
+ index sequence */
+ index_mapping[count] = dict_table_get_index_on_name(
+ ib_table, table->key_info[count].name);
+
+ if (!index_mapping[count]) {
+ sql_print_error("Cannot find index %s in InnoDB "
+ "index dictionary.",
+ table->key_info[count].name);
+ ret = FALSE;
+ goto func_exit;
+ }
+
+ /* Double check fetched index has the same
+ column info as those in mysql key_info. */
+ if (!innobase_match_index_columns(&table->key_info[count],
+ index_mapping[count])) {
+ sql_print_error("Found index %s whose column info "
+ "does not match that of MySQL.",
+ table->key_info[count].name);
+ ret = FALSE;
+ goto func_exit;
+ }
+ }
+
+ /* Successfully built the translation table */
+ share->idx_trans_tbl.index_count = mysql_num_index;
+
+func_exit:
+ if (!ret) {
+ /* Build translation table failed. */
+ my_free(index_mapping, MYF(MY_ALLOW_ZERO_PTR));
+
+ share->idx_trans_tbl.array_size = 0;
+ share->idx_trans_tbl.index_count = 0;
+ index_mapping = NULL;
+ }
+
+ share->idx_trans_tbl.index_mapping = index_mapping;
+
+ DBUG_RETURN(ret);
+}
+
+/*******************************************************************//**
+This function uses index translation table to quickly locate the
+requested index structure.
+Note we do not have mutex protection for the index translatoin table
+access, it is based on the assumption that there is no concurrent
+translation table rebuild (fter create/drop index) and DMLs that
+require index lookup.
+@return dict_index_t structure for requested index. NULL if
+fail to locate the index structure. */
+static
+dict_index_t*
+innobase_index_lookup(
+/*==================*/
+ INNOBASE_SHARE* share, /*!< in: share structure for index
+ translation table. */
+ uint keynr) /*!< in: index number for the requested
+ index */
+{
+ if (!share->idx_trans_tbl.index_mapping
+ || keynr >= share->idx_trans_tbl.index_count) {
+ return(NULL);
+ }
+
+ return(share->idx_trans_tbl.index_mapping[keynr]);
+}
+
+/************************************************************************
Set the autoinc column max value. This should only be called once from
-ha_innobase::open(). Therefore there's no need for a covering lock.
-@return DB_SUCCESS or error code */
+ha_innobase::open(). Therefore there's no need for a covering lock. */
UNIV_INTERN
-ulint
+void
ha_innobase::innobase_initialize_autoinc()
/*======================================*/
{
- dict_index_t* index;
ulonglong auto_inc;
- const char* col_name;
- ulint error;
+ const Field* field = table->found_next_number_field;
- col_name = table->found_next_number_field->field_name;
- index = innobase_get_index(table->s->next_number_index);
+ if (field != NULL) {
+ auto_inc = innobase_get_int_col_max_value(field);
+ } else {
+ /* We have no idea what's been passed in to us as the
+ autoinc column. We set it to the 0, effectively disabling
+ updates to the table. */
+ auto_inc = 0;
- /* Execute SELECT MAX(col_name) FROM TABLE; */
- error = row_search_max_autoinc(index, col_name, &auto_inc);
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Unable to determine the AUTOINC "
+ "column name\n");
+ }
- switch (error) {
- case DB_SUCCESS:
+ if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
+ /* If the recovery level is set so high that writes
+ are disabled we force the AUTOINC counter to 0
+ value effectively disabling writes to the table.
+ Secondly, we avoid reading the table in case the read
+ results in failure due to a corrupted table/index.
+
+ We will not return an error to the client, so that the
+ tables can be dumped with minimal hassle. If an error
+ were returned in this case, the first attempt to read
+ the table would fail and subsequent SELECTs would succeed. */
+ auto_inc = 0;
+ } else if (field == NULL) {
+ /* This is a far more serious error, best to avoid
+ opening the table and return failure. */
+ my_error(ER_AUTOINC_READ_FAILED, MYF(0));
+ } else {
+ dict_index_t* index;
+ const char* col_name;
+ ulonglong read_auto_inc;
+ ulint err;
- /* At the this stage we don't know the increment
- or the offset, so use default inrement of 1. */
- ++auto_inc;
- break;
+ update_thd(ha_thd());
- case DB_RECORD_NOT_FOUND:
- ut_print_timestamp(stderr);
- fprintf(stderr, " InnoDB: MySQL and InnoDB data "
- "dictionaries are out of sync.\n"
- "InnoDB: Unable to find the AUTOINC column %s in the "
- "InnoDB table %s.\n"
- "InnoDB: We set the next AUTOINC column value to the "
- "maximum possible value,\n"
- "InnoDB: in effect disabling the AUTOINC next value "
- "generation.\n"
- "InnoDB: You can either set the next AUTOINC value "
- "explicitly using ALTER TABLE\n"
- "InnoDB: or fix the data dictionary by recreating "
- "the table.\n",
- col_name, index->table->name);
-
- auto_inc = 0xFFFFFFFFFFFFFFFFULL;
- break;
+ ut_a(prebuilt->trx == thd_to_trx(user_thd));
- default:
- return(error);
+ col_name = field->field_name;
+ index = innobase_get_index(table->s->next_number_index);
+
+ /* Execute SELECT MAX(col_name) FROM TABLE; */
+ err = row_search_max_autoinc(index, col_name, &read_auto_inc);
+
+ switch (err) {
+ case DB_SUCCESS:
+ /* At the this stage we do not know the increment
+ or the offset, so use a default increment of 1. */
+ auto_inc = read_auto_inc + 1;
+ break;
+
+ case DB_RECORD_NOT_FOUND:
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: MySQL and InnoDB data "
+ "dictionaries are out of sync.\n"
+ "InnoDB: Unable to find the AUTOINC column "
+ "%s in the InnoDB table %s.\n"
+ "InnoDB: We set the next AUTOINC column "
+ "value to 0,\n"
+ "InnoDB: in effect disabling the AUTOINC "
+ "next value generation.\n"
+ "InnoDB: You can either set the next "
+ "AUTOINC value explicitly using ALTER TABLE\n"
+ "InnoDB: or fix the data dictionary by "
+ "recreating the table.\n",
+ col_name, index->table->name);
+
+ /* This will disable the AUTOINC generation. */
+ auto_inc = 0;
+
+ /* We want the open to succeed, so that the user can
+ take corrective action. ie. reads should succeed but
+ updates should fail. */
+ err = DB_SUCCESS;
+ break;
+ default:
+ /* row_search_max_autoinc() should only return
+ one of DB_SUCCESS or DB_RECORD_NOT_FOUND. */
+ ut_error;
+ }
}
dict_table_autoinc_initialize(prebuilt->table, auto_inc);
-
- return(DB_SUCCESS);
}
/*****************************************************************//**
@@ -3208,6 +3676,11 @@ retry:
primary_key = table->s->primary_key;
key_used_on_scan = primary_key;
+ if (!innobase_build_index_translation(table, ib_table, share)) {
+ sql_print_error("Build InnoDB index translation table for"
+ " Table %s failed", name);
+ }
+
/* Allocate a buffer for a 'row reference'. A row reference is
a string of bytes of length ref_length which uniquely specifies
a row in our table. Note that MySQL may also compare two row
@@ -3215,31 +3688,86 @@ retry:
of length ref_length! */
if (!row_table_got_default_clust_index(ib_table)) {
- if (primary_key >= MAX_KEY) {
- sql_print_error("Table %s has a primary key in InnoDB data "
- "dictionary, but not in MySQL!", name);
- }
prebuilt->clust_index_was_generated = FALSE;
- /* MySQL allocates the buffer for ref. key_info->key_length
- includes space for all key columns + one byte for each column
- that may be NULL. ref_length must be as exact as possible to
- save space, because all row reference buffers are allocated
- based on ref_length. */
+ if (UNIV_UNLIKELY(primary_key >= MAX_KEY)) {
+ sql_print_error("Table %s has a primary key in "
+ "InnoDB data dictionary, but not "
+ "in MySQL!", name);
- ref_length = table->key_info[primary_key].key_length;
+ /* This mismatch could cause further problems
+ if not attended, bring this to the user's attention
+ by printing a warning in addition to log a message
+ in the errorlog */
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NO_SUCH_INDEX,
+ "InnoDB: Table %s has a "
+ "primary key in InnoDB data "
+ "dictionary, but not in "
+ "MySQL!", name);
+
+ /* If primary_key >= MAX_KEY, its (primary_key)
+ value could be out of bound if continue to index
+ into key_info[] array. Find InnoDB primary index,
+ and assign its key_length to ref_length.
+ In addition, since MySQL indexes are sorted starting
+ with primary index, unique index etc., initialize
+ ref_length to the first index key length in
+ case we fail to find InnoDB cluster index.
+
+ Please note, this will not resolve the primary
+ index mismatch problem, other side effects are
+ possible if users continue to use the table.
+ However, we allow this table to be opened so
+ that user can adopt necessary measures for the
+ mismatch while still being accessible to the table
+ date. */
+ ref_length = table->key_info[0].key_length;
+
+ /* Find correspoinding cluster index
+ key length in MySQL's key_info[] array */
+ for (ulint i = 0; i < table->s->keys; i++) {
+ dict_index_t* index;
+ index = innobase_get_index(i);
+ if (dict_index_is_clust(index)) {
+ ref_length =
+ table->key_info[i].key_length;
+ }
+ }
+ } else {
+ /* MySQL allocates the buffer for ref.
+ key_info->key_length includes space for all key
+ columns + one byte for each column that may be
+ NULL. ref_length must be as exact as possible to
+ save space, because all row reference buffers are
+ allocated based on ref_length. */
+
+ ref_length = table->key_info[primary_key].key_length;
+ }
} else {
if (primary_key != MAX_KEY) {
- sql_print_error("Table %s has no primary key in InnoDB data "
- "dictionary, but has one in MySQL! If you "
- "created the table with a MySQL version < "
- "3.23.54 and did not define a primary key, "
- "but defined a unique key with all non-NULL "
- "columns, then MySQL internally treats that "
- "key as the primary key. You can fix this "
- "error by dump + DROP + CREATE + reimport "
- "of the table.", name);
+ sql_print_error(
+ "Table %s has no primary key in InnoDB data "
+ "dictionary, but has one in MySQL! If you "
+ "created the table with a MySQL version < "
+ "3.23.54 and did not define a primary key, "
+ "but defined a unique key with all non-NULL "
+ "columns, then MySQL internally treats that "
+ "key as the primary key. You can fix this "
+ "error by dump + DROP + CREATE + reimport "
+ "of the table.", name);
+
+ /* This mismatch could cause further problems
+ if not attended, bring this to the user attention
+ by printing a warning in addition to log a message
+ in the errorlog */
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NO_SUCH_INDEX,
+ "InnoDB: Table %s has no "
+ "primary key in InnoDB data "
+ "dictionary, but has one in "
+ "MySQL!", name);
}
prebuilt->clust_index_was_generated = TRUE;
@@ -3281,8 +3809,6 @@ retry:
/* Only if the table has an AUTOINC column. */
if (prebuilt->table != NULL && table->found_next_number_field != NULL) {
- ulint error;
-
dict_table_autoinc_lock(prebuilt->table);
/* Since a table can already be "open" in InnoDB's internal
@@ -3291,8 +3817,7 @@ retry:
autoinc value from a previous MySQL open. */
if (dict_table_autoinc_read(prebuilt->table) == 0) {
- error = innobase_initialize_autoinc();
- ut_a(error == DB_SUCCESS);
+ innobase_initialize_autoinc();
}
dict_table_autoinc_unlock(prebuilt->table);
@@ -4109,67 +4634,6 @@ skip_field:
}
/********************************************************************//**
-Get the upper limit of the MySQL integral and floating-point type. */
-UNIV_INTERN
-ulonglong
-ha_innobase::innobase_get_int_col_max_value(
-/*========================================*/
- const Field* field)
-{
- ulonglong max_value = 0;
-
- switch(field->key_type()) {
- /* TINY */
- case HA_KEYTYPE_BINARY:
- max_value = 0xFFULL;
- break;
- case HA_KEYTYPE_INT8:
- max_value = 0x7FULL;
- break;
- /* SHORT */
- case HA_KEYTYPE_USHORT_INT:
- max_value = 0xFFFFULL;
- break;
- case HA_KEYTYPE_SHORT_INT:
- max_value = 0x7FFFULL;
- break;
- /* MEDIUM */
- case HA_KEYTYPE_UINT24:
- max_value = 0xFFFFFFULL;
- break;
- case HA_KEYTYPE_INT24:
- max_value = 0x7FFFFFULL;
- break;
- /* LONG */
- case HA_KEYTYPE_ULONG_INT:
- max_value = 0xFFFFFFFFULL;
- break;
- case HA_KEYTYPE_LONG_INT:
- max_value = 0x7FFFFFFFULL;
- break;
- /* BIG */
- case HA_KEYTYPE_ULONGLONG:
- max_value = 0xFFFFFFFFFFFFFFFFULL;
- break;
- case HA_KEYTYPE_LONGLONG:
- max_value = 0x7FFFFFFFFFFFFFFFULL;
- break;
- case HA_KEYTYPE_FLOAT:
- /* We use the maximum as per IEEE754-2008 standard, 2^24 */
- max_value = 0x1000000ULL;
- break;
- case HA_KEYTYPE_DOUBLE:
- /* We use the maximum as per IEEE754-2008 standard, 2^53 */
- max_value = 0x20000000000000ULL;
- break;
- default:
- ut_error;
- }
-
- return(max_value);
-}
-
-/********************************************************************//**
This special handling is really to overcome the limitations of MySQL's
binlogging. We need to eliminate the non-determinism that will arise in
INSERT ... SELECT type of statements, since MySQL binlog only stores the
@@ -4394,11 +4858,17 @@ no_commit:
prebuilt->autoinc_error = DB_SUCCESS;
if ((error = update_auto_increment())) {
-
/* We don't want to mask autoinc overflow errors. */
- if (prebuilt->autoinc_error != DB_SUCCESS) {
- error = (int) prebuilt->autoinc_error;
+ /* Handle the case where the AUTOINC sub-system
+ failed during initialization. */
+ if (prebuilt->autoinc_error == DB_UNSUPPORTED) {
+ error_result = ER_AUTOINC_READ_FAILED;
+ /* Set the error message to report too. */
+ my_error(ER_AUTOINC_READ_FAILED, MYF(0));
+ goto func_exit;
+ } else if (prebuilt->autoinc_error != DB_SUCCESS) {
+ error = (int) prebuilt->autoinc_error;
goto report_error;
}
@@ -4479,24 +4949,29 @@ no_commit:
update the table upper limit. Note: last_value
will be 0 if get_auto_increment() was not called.*/
- if (auto_inc <= col_max_value
- && auto_inc >= prebuilt->autoinc_last_value) {
+ if (auto_inc >= prebuilt->autoinc_last_value) {
set_max_autoinc:
- ut_a(prebuilt->autoinc_increment > 0);
+ /* This should filter out the negative
+ values set explicitly by the user. */
+ if (auto_inc <= col_max_value) {
+ ut_a(prebuilt->autoinc_increment > 0);
- ulonglong need;
- ulonglong offset;
+ ulonglong need;
+ ulonglong offset;
- offset = prebuilt->autoinc_offset;
- need = prebuilt->autoinc_increment;
+ offset = prebuilt->autoinc_offset;
+ need = prebuilt->autoinc_increment;
- auto_inc = innobase_next_autoinc(
- auto_inc, need, offset, col_max_value);
+ auto_inc = innobase_next_autoinc(
+ auto_inc,
+ need, offset, col_max_value);
- err = innobase_set_max_autoinc(auto_inc);
+ err = innobase_set_max_autoinc(
+ auto_inc);
- if (err != DB_SUCCESS) {
- error = err;
+ if (err != DB_SUCCESS) {
+ error = err;
+ }
}
}
break;
@@ -4527,7 +5002,7 @@ calc_row_difference(
upd_t* uvect, /*!< in/out: update vector */
uchar* old_row, /*!< in: old row in MySQL format */
uchar* new_row, /*!< in: new row in MySQL format */
- TABLE* table, /*!< in: table in MySQL data
+ TABLE* table, /*!< in: table in MySQL data
dictionary */
uchar* upd_buff, /*!< in: buffer to use */
ulint buff_len, /*!< in: buffer length */
@@ -5176,14 +5651,30 @@ ha_innobase::innobase_get_index(
DBUG_ENTER("innobase_get_index");
ha_statistic_increment(&SSV::ha_read_key_count);
- ut_ad(user_thd == ha_thd());
- ut_a(prebuilt->trx == thd_to_trx(user_thd));
-
if (keynr != MAX_KEY && table->s->keys > 0) {
key = table->key_info + keynr;
- index = dict_table_get_index_on_name(prebuilt->table,
- key->name);
+ index = innobase_index_lookup(share, keynr);
+
+ if (index) {
+ ut_a(ut_strcmp(index->name, key->name) == 0);
+ } else {
+ /* Can't find index with keynr in the translation
+ table. Only print message if the index translation
+ table exists */
+ if (share->idx_trans_tbl.index_mapping) {
+ sql_print_error("InnoDB could not find "
+ "index %s key no %u for "
+ "table %s through its "
+ "index translation table",
+ key ? key->name : "NULL",
+ keynr,
+ prebuilt->table->name);
+ }
+
+ index = dict_table_get_index_on_name(prebuilt->table,
+ key->name);
+ }
} else {
index = dict_table_get_first_index(prebuilt->table);
}
@@ -5244,7 +5735,7 @@ ha_innobase::change_active_index(
dtuple_set_n_fields(prebuilt->search_tuple, prebuilt->index->n_fields);
dict_index_copy_types(prebuilt->search_tuple, prebuilt->index,
- prebuilt->index->n_fields);
+ prebuilt->index->n_fields);
/* MySQL changes the active index for a handle also during some
queries, for example SELECT MAX(a), SUM(a) first retrieves the MAX()
@@ -5745,9 +6236,11 @@ create_table_def(
if (error == DB_DUPLICATE_KEY) {
char buf[100];
- innobase_convert_identifier(buf, sizeof buf,
- table_name, strlen(table_name),
- trx->mysql_thd, TRUE);
+ char* buf_end = innobase_convert_identifier(
+ buf, sizeof buf - 1, table_name, strlen(table_name),
+ trx->mysql_thd, TRUE);
+
+ *buf_end = '\0';
my_error(ER_TABLE_EXISTS_ERROR, MYF(0), buf);
}
@@ -6349,6 +6842,10 @@ ha_innobase::create(
goto cleanup;
}
+ if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
+ flags |= DICT_TF2_TEMPORARY << DICT_TF2_SHIFT;
+ }
+
error = create_table_def(trx, form, norm_name,
create_info->options & HA_LEX_CREATE_TMP_TABLE ? name2 : NULL,
flags);
@@ -6870,10 +7367,15 @@ ha_innobase::records_in_range(
key = table->key_info + active_index;
- index = dict_table_get_index_on_name(prebuilt->table, key->name);
+ index = innobase_get_index(keynr);
- /* MySQL knows about this index and so we must be able to find it.*/
- ut_a(index);
+ /* There exists possibility of not being able to find requested
+ index due to inconsistency between MySQL and InoDB dictionary info.
+ Necessary message should have been printed in innobase_get_index() */
+ if (UNIV_UNLIKELY(!index)) {
+ n_rows = HA_POS_ERROR;
+ goto func_exit;
+ }
heap = mem_heap_create(2 * (key->key_parts * sizeof(dfield_t)
+ sizeof(dtuple_t)));
@@ -6918,6 +7420,7 @@ ha_innobase::records_in_range(
mem_heap_free(heap);
+func_exit:
my_free(key_val_buff2, MYF(0));
prebuilt->trx->op_info = (char*)"";
@@ -7059,6 +7562,7 @@ ha_innobase::info(
char path[FN_REFLEN];
os_file_stat_t stat_info;
+
DBUG_ENTER("info");
/* If we are forcing recovery at a high level, we will suppress
@@ -7219,13 +7723,29 @@ ha_innobase::info(
}
if (flag & HA_STATUS_CONST) {
- index = dict_table_get_first_index(ib_table);
+ /* Verify the number of index in InnoDB and MySQL
+ matches up. If prebuilt->clust_index_was_generated
+ holds, InnoDB defines GEN_CLUST_INDEX internally */
+ ulint num_innodb_index = UT_LIST_GET_LEN(ib_table->indexes)
+ - prebuilt->clust_index_was_generated;
- if (prebuilt->clust_index_was_generated) {
- index = dict_table_get_next_index(index);
+ if (table->s->keys != num_innodb_index) {
+ sql_print_error("Table %s contains %lu "
+ "indexes inside InnoDB, which "
+ "is different from the number of "
+ "indexes %u defined in the MySQL ",
+ ib_table->name, num_innodb_index,
+ table->s->keys);
}
for (i = 0; i < table->s->keys; i++) {
+ /* We could get index quickly through internal
+ index mapping with the index translation table.
+ The identity of index (match up index name with
+ that of table->key_info[i]) is already verified in
+ innobase_get_index(). */
+ index = innobase_get_index(i);
+
if (index == NULL) {
sql_print_error("Table %s contains fewer "
"indexes inside InnoDB than "
@@ -7277,8 +7797,6 @@ ha_innobase::info(
rec_per_key >= ~(ulong) 0 ? ~(ulong) 0 :
(ulong) rec_per_key;
}
-
- index = dict_table_get_next_index(index);
}
}
@@ -7320,12 +7838,12 @@ ha_innobase::analyze(
{
/* Serialize ANALYZE TABLE inside InnoDB, see
Bug#38996 Race condition in ANALYZE TABLE */
- pthread_mutex_lock(&analyze_mutex);
+ mysql_mutex_lock(&analyze_mutex);
/* Simply call ::info() with all the flags */
info(HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE);
- pthread_mutex_unlock(&analyze_mutex);
+ mysql_mutex_unlock(&analyze_mutex);
return(0);
}
@@ -7356,8 +7874,13 @@ ha_innobase::check(
HA_CHECK_OPT* check_opt) /*!< in: check options, currently
ignored */
{
- ulint ret;
+ dict_index_t* index;
+ ulint n_rows;
+ ulint n_rows_in_table = ULINT_UNDEFINED;
+ ibool is_ok = TRUE;
+ ulint old_isolation_level;
+ DBUG_ENTER("ha_innobase::check");
DBUG_ASSERT(thd == ha_thd());
ut_a(prebuilt->trx);
ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N);
@@ -7370,17 +7893,140 @@ ha_innobase::check(
build_template(prebuilt, NULL, table, ROW_MYSQL_WHOLE_ROW);
}
- ret = row_check_table_for_mysql(prebuilt);
+ if (prebuilt->table->ibd_file_missing) {
+ sql_print_error("InnoDB: Error:\n"
+ "InnoDB: MySQL is trying to use a table handle"
+ " but the .ibd file for\n"
+ "InnoDB: table %s does not exist.\n"
+ "InnoDB: Have you deleted the .ibd file"
+ " from the database directory under\n"
+ "InnoDB: the MySQL datadir, or have you"
+ " used DISCARD TABLESPACE?\n"
+ "InnoDB: Please refer to\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+ "InnoDB: how you can resolve the problem.\n",
+ prebuilt->table->name);
+ DBUG_RETURN(HA_ADMIN_CORRUPT);
+ }
- switch (ret) {
- case DB_SUCCESS:
- return(HA_ADMIN_OK);
- case DB_INTERRUPTED:
+ prebuilt->trx->op_info = "checking table";
+
+ old_isolation_level = prebuilt->trx->isolation_level;
+
+ /* We must run the index record counts at an isolation level
+ >= READ COMMITTED, because a dirty read can see a wrong number
+ of records in some index; to play safe, we use always
+ REPEATABLE READ here */
+
+ prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+ /* Enlarge the fatal lock wait timeout during CHECK TABLE. */
+ mutex_enter(&kernel_mutex);
+ srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */
+ mutex_exit(&kernel_mutex);
+
+ for (index = dict_table_get_first_index(prebuilt->table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+#if 0
+ fputs("Validating index ", stderr);
+ ut_print_name(stderr, trx, FALSE, index->name);
+ putc('\n', stderr);
+#endif
+
+ if (!btr_validate_index(index, prebuilt->trx)) {
+ is_ok = FALSE;
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: The B-tree of"
+ " index '%-.200s' is corrupted.",
+ index->name);
+ continue;
+ }
+
+ /* Instead of invoking change_active_index(), set up
+ a dummy template for non-locking reads, disabling
+ access to the clustered index. */
+ prebuilt->index = index;
+
+ prebuilt->index_usable = row_merge_is_index_usable(
+ prebuilt->trx, prebuilt->index);
+
+ if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ HA_ERR_TABLE_DEF_CHANGED,
+ "InnoDB: Insufficient history for"
+ " index '%-.200s'",
+ index->name);
+ continue;
+ }
+
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE;
+ prebuilt->n_template = 0;
+ prebuilt->need_to_access_clustered = FALSE;
+
+ dtuple_set_n_fields(prebuilt->search_tuple, 0);
+
+ prebuilt->select_lock_type = LOCK_NONE;
+
+ if (!row_check_index_for_mysql(prebuilt, index, &n_rows)) {
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: The B-tree of"
+ " index '%-.200s' is corrupted.",
+ index->name);
+ is_ok = FALSE;
+ }
+
+ if (thd_killed(user_thd)) {
+ break;
+ }
+
+#if 0
+ fprintf(stderr, "%lu entries in index %s\n", n_rows,
+ index->name);
+#endif
+
+ if (index == dict_table_get_first_index(prebuilt->table)) {
+ n_rows_in_table = n_rows;
+ } else if (n_rows != n_rows_in_table) {
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: Index '%-.200s'"
+ " contains %lu entries,"
+ " should be %lu.",
+ index->name,
+ (ulong) n_rows,
+ (ulong) n_rows_in_table);
+ is_ok = FALSE;
+ }
+ }
+
+ /* Restore the original isolation level */
+ prebuilt->trx->isolation_level = old_isolation_level;
+
+ /* We validate also the whole adaptive hash index for all tables
+ at every CHECK TABLE */
+
+ if (!btr_search_validate()) {
+ push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: The adaptive hash index is corrupted.");
+ is_ok = FALSE;
+ }
+
+ /* Restore the fatal lock wait timeout after CHECK TABLE. */
+ mutex_enter(&kernel_mutex);
+ srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */
+ mutex_exit(&kernel_mutex);
+
+ prebuilt->trx->op_info = "";
+ if (thd_killed(user_thd)) {
my_error(ER_QUERY_INTERRUPTED, MYF(0));
- return(-1);
- default:
- return(HA_ADMIN_CORRUPT);
}
+
+ DBUG_RETURN(is_ok ? HA_ADMIN_OK : HA_ADMIN_CORRUPT);
}
/*************************************************************//**
@@ -7927,7 +8573,6 @@ ha_innobase::external_lock(
ulong const tx_isolation = thd_tx_isolation(ha_thd());
if (tx_isolation <= ISO_READ_COMMITTED
&& binlog_format == BINLOG_FORMAT_STMT
- && !(table_flags() & HA_BINLOG_STMT_CAPABLE)
#if MYSQL_VERSION_ID > 50140
&& thd_binlog_filter_ok(thd)
#endif /* MYSQL_VERSION_ID > 50140 */
@@ -7939,15 +8584,8 @@ ha_innobase::external_lock(
" InnoDB is not safe for binlog mode '%s'",
tx_isolation_names[tx_isolation],
binlog_format_names[binlog_format]);
- /* The error may be suppressed by test cases, by setting
- the no_innodb_binlog_errors debug symbol. */
- if (DBUG_EVALUATE_IF("no_innodb_binlog_errors", 0, 1)) {
- my_error(ER_BINLOG_STMT_MODE_AND_ROW_ENGINE, MYF(0),
- " InnoDB is limited to row-logging when "
- "transaction isolation level is "
- "READ COMMITTED or READ UNCOMMITTED.");
- DBUG_RETURN(HA_ERR_LOGGING_IMPOSSIBLE);
- }
+ my_error(ER_BINLOG_LOGGING_IMPOSSIBLE, MYF(0), buf);
+ DBUG_RETURN(HA_ERR_LOGGING_IMPOSSIBLE);
}
}
@@ -8211,8 +8849,8 @@ innodb_show_status(
mutex_enter(&srv_monitor_file_mutex);
rewind(srv_monitor_file);
- srv_printf_innodb_monitor(srv_monitor_file,
- &trx_list_start, &trx_list_end);
+ srv_printf_innodb_monitor(srv_monitor_file, FALSE,
+ &trx_list_start, &trx_list_end);
flen = ftell(srv_monitor_file);
os_file_set_eof(srv_monitor_file);
@@ -8230,8 +8868,8 @@ innodb_show_status(
read the contents of the temporary file */
if (!(str = (char*) my_malloc(usable_len + 1, MYF(0)))) {
- mutex_exit(&srv_monitor_file_mutex);
- DBUG_RETURN(TRUE);
+ mutex_exit(&srv_monitor_file_mutex);
+ DBUG_RETURN(TRUE);
}
rewind(srv_monitor_file);
@@ -8269,19 +8907,25 @@ innodb_show_status(
}
/************************************************************************//**
-Implements the SHOW MUTEX STATUS command. . */
+Implements the SHOW MUTEX STATUS command.
+@return TRUE on failure, FALSE on success. */
static
bool
innodb_mutex_show_status(
/*=====================*/
- handlerton* hton, /*!< in: the innodb handlerton */
+ handlerton* hton, /*!< in: the innodb handlerton */
THD* thd, /*!< in: the MySQL query thread of the
caller */
- stat_print_fn* stat_print)
+ stat_print_fn* stat_print) /*!< in: function for printing
+ statistics */
{
char buf1[IO_SIZE], buf2[IO_SIZE];
mutex_t* mutex;
rw_lock_t* lock;
+ ulint block_mutex_oswait_count = 0;
+ ulint block_lock_oswait_count = 0;
+ mutex_t* block_mutex = NULL;
+ rw_lock_t* block_lock = NULL;
#ifdef UNIV_DEBUG
ulint rw_lock_count= 0;
ulint rw_lock_count_spin_loop= 0;
@@ -8296,12 +8940,16 @@ innodb_mutex_show_status(
mutex_enter(&mutex_list_mutex);
- mutex = UT_LIST_GET_FIRST(mutex_list);
+ for (mutex = UT_LIST_GET_FIRST(mutex_list); mutex != NULL;
+ mutex = UT_LIST_GET_NEXT(list, mutex)) {
+ if (mutex->count_os_wait == 0) {
+ continue;
+ }
- while (mutex != NULL) {
- if (mutex->count_os_wait == 0
- || buf_pool_is_block_mutex(mutex)) {
- goto next_mutex;
+ if (buf_pool_is_block_mutex(mutex)) {
+ block_mutex = mutex;
+ block_mutex_oswait_count += mutex->count_os_wait;
+ continue;
}
#ifdef UNIV_DEBUG
if (mutex->mutex_type != 1) {
@@ -8328,8 +8976,7 @@ innodb_mutex_show_status(
DBUG_RETURN(1);
}
}
- }
- else {
+ } else {
rw_lock_count += mutex->count_using;
rw_lock_count_spin_loop += mutex->count_spin_loop;
rw_lock_count_spin_rounds += mutex->count_spin_rounds;
@@ -8341,7 +8988,7 @@ innodb_mutex_show_status(
buf1len= (uint) my_snprintf(buf1, sizeof(buf1), "%s:%lu",
mutex->cfile_name, (ulong) mutex->cline);
buf2len= (uint) my_snprintf(buf2, sizeof(buf2), "os_waits=%lu",
- mutex->count_os_wait);
+ (ulong) mutex->count_os_wait);
if (stat_print(thd, innobase_hton_name,
hton_name_len, buf1, buf1len,
@@ -8350,45 +8997,83 @@ innodb_mutex_show_status(
DBUG_RETURN(1);
}
#endif /* UNIV_DEBUG */
+ }
-next_mutex:
- mutex = UT_LIST_GET_NEXT(list, mutex);
+ if (block_mutex) {
+ buf1len = (uint) my_snprintf(buf1, sizeof buf1,
+ "combined %s:%lu",
+ block_mutex->cfile_name,
+ (ulong) block_mutex->cline);
+ buf2len = (uint) my_snprintf(buf2, sizeof buf2,
+ "os_waits=%lu",
+ (ulong) block_mutex_oswait_count);
+
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len, buf1, buf1len,
+ buf2, buf2len)) {
+ mutex_exit(&mutex_list_mutex);
+ DBUG_RETURN(1);
+ }
}
mutex_exit(&mutex_list_mutex);
mutex_enter(&rw_lock_list_mutex);
- lock = UT_LIST_GET_FIRST(rw_lock_list);
-
- while (lock != NULL) {
- if (lock->count_os_wait
- && !buf_pool_is_block_lock(lock)) {
- buf1len= my_snprintf(buf1, sizeof(buf1), "%s:%lu",
- lock->cfile_name, (ulong) lock->cline);
- buf2len= my_snprintf(buf2, sizeof(buf2),
- "os_waits=%lu", lock->count_os_wait);
-
- if (stat_print(thd, innobase_hton_name,
- hton_name_len, buf1, buf1len,
- buf2, buf2len)) {
- mutex_exit(&rw_lock_list_mutex);
- DBUG_RETURN(1);
- }
+ for (lock = UT_LIST_GET_FIRST(rw_lock_list); lock != NULL;
+ lock = UT_LIST_GET_NEXT(list, lock)) {
+ if (lock->count_os_wait == 0) {
+ continue;
+ }
+
+ if (buf_pool_is_block_lock(lock)) {
+ block_lock = lock;
+ block_lock_oswait_count += lock->count_os_wait;
+ continue;
+ }
+
+ buf1len = my_snprintf(buf1, sizeof buf1, "%s:%lu",
+ lock->cfile_name, (ulong) lock->cline);
+ buf2len = my_snprintf(buf2, sizeof buf2, "os_waits=%lu",
+ (ulong) lock->count_os_wait);
+
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len, buf1, buf1len,
+ buf2, buf2len)) {
+ mutex_exit(&rw_lock_list_mutex);
+ DBUG_RETURN(1);
+ }
+ }
+
+ if (block_lock) {
+ buf1len = (uint) my_snprintf(buf1, sizeof buf1,
+ "combined %s:%lu",
+ block_lock->cfile_name,
+ (ulong) block_lock->cline);
+ buf2len = (uint) my_snprintf(buf2, sizeof buf2,
+ "os_waits=%lu",
+ (ulong) block_lock_oswait_count);
+
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len, buf1, buf1len,
+ buf2, buf2len)) {
+ mutex_exit(&rw_lock_list_mutex);
+ DBUG_RETURN(1);
}
- lock = UT_LIST_GET_NEXT(list, lock);
}
mutex_exit(&rw_lock_list_mutex);
#ifdef UNIV_DEBUG
- buf2len= my_snprintf(buf2, sizeof(buf2),
- "count=%lu, spin_waits=%lu, spin_rounds=%lu, "
- "os_waits=%lu, os_yields=%lu, os_wait_times=%lu",
- rw_lock_count, rw_lock_count_spin_loop,
- rw_lock_count_spin_rounds,
- rw_lock_count_os_wait, rw_lock_count_os_yield,
- (ulong) (rw_lock_wait_time/1000));
+ buf2len = my_snprintf(buf2, sizeof buf2,
+ "count=%lu, spin_waits=%lu, spin_rounds=%lu, "
+ "os_waits=%lu, os_yields=%lu, os_wait_times=%lu",
+ (ulong) rw_lock_count,
+ (ulong) rw_lock_count_spin_loop,
+ (ulong) rw_lock_count_spin_rounds,
+ (ulong) rw_lock_count_os_wait,
+ (ulong) rw_lock_count_os_yield,
+ (ulong) (rw_lock_wait_time / 1000));
if (stat_print(thd, innobase_hton_name, hton_name_len,
STRING_WITH_LEN("rw_lock_mutexes"), buf2, buf2len)) {
@@ -8424,7 +9109,7 @@ bool innobase_show_status(handlerton *hton, THD* thd,
static INNOBASE_SHARE* get_share(const char* table_name)
{
INNOBASE_SHARE *share;
- pthread_mutex_lock(&innobase_share_mutex);
+ mysql_mutex_lock(&innobase_share_mutex);
ulint fold = ut_fold_string(table_name);
@@ -8450,17 +9135,22 @@ static INNOBASE_SHARE* get_share(const char* table_name)
innobase_open_tables, fold, share);
thr_lock_init(&share->lock);
+
+ /* Index translation table initialization */
+ share->idx_trans_tbl.index_mapping = NULL;
+ share->idx_trans_tbl.index_count = 0;
+ share->idx_trans_tbl.array_size = 0;
}
share->use_count++;
- pthread_mutex_unlock(&innobase_share_mutex);
+ mysql_mutex_unlock(&innobase_share_mutex);
return(share);
}
static void free_share(INNOBASE_SHARE* share)
{
- pthread_mutex_lock(&innobase_share_mutex);
+ mysql_mutex_lock(&innobase_share_mutex);
#ifdef UNIV_DEBUG
INNOBASE_SHARE* share2;
@@ -8480,13 +9170,18 @@ static void free_share(INNOBASE_SHARE* share)
HASH_DELETE(INNOBASE_SHARE, table_name_hash,
innobase_open_tables, fold, share);
thr_lock_delete(&share->lock);
+
+ /* Free any memory from index translation table */
+ my_free(share->idx_trans_tbl.index_mapping,
+ MYF(MY_ALLOW_ZERO_PTR));
+
my_free(share, MYF(0));
/* TODO: invoke HASH_MIGRATE if innobase_open_tables
shrinks too much */
}
- pthread_mutex_unlock(&innobase_share_mutex);
+ mysql_mutex_unlock(&innobase_share_mutex);
}
/*****************************************************************//**
@@ -8714,7 +9409,10 @@ ha_innobase::innobase_get_autoinc(
*value = dict_table_autoinc_read(prebuilt->table);
/* It should have been initialized during open. */
- ut_a(*value != 0);
+ if (*value == 0) {
+ prebuilt->autoinc_error = DB_UNSUPPORTED;
+ dict_table_autoinc_unlock(prebuilt->table);
+ }
}
return(prebuilt->autoinc_error);
@@ -8794,6 +9492,11 @@ ha_innobase::get_auto_increment(
invoking this method. So we are not sure if it's guaranteed to
be 0 or not. */
+ /* We need the upper limit of the col type to check for
+ whether we update the table autoinc counter or not. */
+ ulonglong col_max_value = innobase_get_int_col_max_value(
+ table->next_number_field);
+
/* Called for the first time ? */
if (trx->n_autoinc_rows == 0) {
@@ -8810,6 +9513,11 @@ ha_innobase::get_auto_increment(
/* Not in the middle of a mult-row INSERT. */
} else if (prebuilt->autoinc_last_value == 0) {
set_if_bigger(*first_value, autoinc);
+ /* Check for -ve values. */
+ } else if (*first_value > col_max_value && trx->n_autoinc_rows > 0) {
+ /* Set to next logical value. */
+ ut_a(autoinc > trx->n_autoinc_rows);
+ *first_value = (autoinc - trx->n_autoinc_rows) - 1;
}
*nb_reserved_values = trx->n_autoinc_rows;
@@ -8820,12 +9528,6 @@ ha_innobase::get_auto_increment(
ulonglong need;
ulonglong current;
ulonglong next_value;
- ulonglong col_max_value;
-
- /* We need the upper limit of the col type to check for
- whether we update the table autoinc counter or not. */
- col_max_value = innobase_get_int_col_max_value(
- table->next_number_field);
current = *first_value > col_max_value ? autoinc : *first_value;
need = *nb_reserved_values * increment;
@@ -9188,7 +9890,7 @@ innobase_xa_prepare(
In this case we cannot know how many minutes or hours
will be between XA PREPARE and XA COMMIT, and we don't want
to block for undefined period of time. */
- pthread_mutex_lock(&prepare_commit_mutex);
+ mysql_mutex_lock(&prepare_commit_mutex);
trx->active_trans = 2;
}
@@ -9322,33 +10024,60 @@ innobase_set_cursor_view(
(cursor_view_t*) curview);
}
+/*******************************************************************//**
+If col_name is not NULL, check whether the named column is being
+renamed in the table. If col_name is not provided, check
+whether any one of columns in the table is being renamed.
+@return true if the column is being renamed */
+static
+bool
+check_column_being_renamed(
+/*=======================*/
+ const TABLE* table, /*!< in: MySQL table */
+ const char* col_name) /*!< in: name of the column */
+{
+ uint k;
+ Field* field;
+
+ for (k = 0; k < table->s->fields; k++) {
+ field = table->field[k];
-/***********************************************************************
-Check whether any of the given columns is being renamed in the table. */
+ if (field->flags & FIELD_IS_RENAMED) {
+
+ /* If col_name is not provided, return
+ if the field is marked as being renamed. */
+ if (!col_name) {
+ return(true);
+ }
+
+ /* If col_name is provided, return only
+ if names match */
+ if (innobase_strcasecmp(field->field_name,
+ col_name) == 0) {
+ return(true);
+ }
+ }
+ }
+
+ return(false);
+}
+
+/*******************************************************************//**
+Check whether any of the given columns is being renamed in the table.
+@return true if any of col_names is being renamed in table */
static
bool
column_is_being_renamed(
/*====================*/
- /* out: true if any of col_names is
- being renamed in table */
- TABLE* table, /* in: MySQL table */
- uint n_cols, /* in: number of columns */
- const char** col_names) /* in: names of the columns */
+ TABLE* table, /*!< in: MySQL table */
+ uint n_cols, /*!< in: number of columns */
+ const char** col_names) /*!< in: names of the columns */
{
uint j;
- uint k;
- Field* field;
- const char* col_name;
for (j = 0; j < n_cols; j++) {
- col_name = col_names[j];
- for (k = 0; k < table->s->fields; k++) {
- field = table->field[k];
- if ((field->flags & FIELD_IS_RENAMED)
- && innobase_strcasecmp(field->field_name,
- col_name) == 0) {
- return(true);
- }
+ if (check_column_being_renamed(table, col_names[j])) {
+ return(true);
}
}
@@ -9432,6 +10161,15 @@ ha_innobase::check_if_incompatible_data(
return(COMPATIBLE_DATA_NO);
}
+ /* For column rename operation, MySQL does not supply enough
+ information (new column name etc.) for InnoDB to make appropriate
+ system metadata change. To avoid system metadata inconsistency,
+ currently we can just request a table rebuild/copy by returning
+ COMPATIBLE_DATA_NO */
+ if (check_column_being_renamed(table, NULL)) {
+ return COMPATIBLE_DATA_NO;
+ }
+
/* Check if a column participating in a foreign key is being renamed.
There is no mechanism for updating InnoDB foreign key definitions. */
if (foreign_key_column_is_being_renamed(prebuilt, table)) {
@@ -9936,6 +10674,23 @@ static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity,
"Number of IOPs the server can do. Tunes the background IO rate",
NULL, NULL, 200, 100, ~0L, 0);
+static MYSQL_SYSVAR_ULONG(purge_batch_size, srv_purge_batch_size,
+ PLUGIN_VAR_OPCMDARG,
+ "Number of UNDO logs to purge in one batch from the history list. "
+ "Default is 20",
+ NULL, NULL,
+ 20, /* Default setting */
+ 1, /* Minimum value */
+ 5000, 0); /* Maximum value */
+
+static MYSQL_SYSVAR_ULONG(purge_threads, srv_n_purge_threads,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Purge threads can be either 0 or 1. Default is 0.",
+ NULL, NULL,
+ 0, /* Default setting */
+ 0, /* Minimum value */
+ 1, 0); /* Maximum value */
+
static MYSQL_SYSVAR_ULONG(fast_shutdown, innobase_fast_shutdown,
PLUGIN_VAR_OPCMDARG,
"Speeds up the shutdown process of the InnoDB storage engine. Possible "
@@ -10174,6 +10929,11 @@ static MYSQL_SYSVAR_BOOL(use_sys_malloc, srv_use_sys_malloc,
"Use OS memory allocator instead of InnoDB's internal memory allocator",
NULL, NULL, TRUE);
+static MYSQL_SYSVAR_BOOL(use_native_aio, srv_use_native_aio,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Use native AIO if supported on this platform.",
+ NULL, NULL, TRUE);
+
static MYSQL_SYSVAR_STR(change_buffering, innobase_change_buffering,
PLUGIN_VAR_RQCMDARG,
"Buffer changes to reduce random access: "
@@ -10183,7 +10943,7 @@ static MYSQL_SYSVAR_STR(change_buffering, innobase_change_buffering,
static MYSQL_SYSVAR_ULONG(read_ahead_threshold, srv_read_ahead_threshold,
PLUGIN_VAR_RQCMDARG,
- "Number of pages that must be accessed sequentially for InnoDB to"
+ "Number of pages that must be accessed sequentially for InnoDB to "
"trigger a readahead.",
NULL, NULL, 56, 0, 64, 0);
@@ -10240,9 +11000,12 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(autoinc_lock_mode),
MYSQL_SYSVAR(version),
MYSQL_SYSVAR(use_sys_malloc),
+ MYSQL_SYSVAR(use_native_aio),
MYSQL_SYSVAR(change_buffering),
MYSQL_SYSVAR(read_ahead_threshold),
MYSQL_SYSVAR(io_capacity),
+ MYSQL_SYSVAR(purge_threads),
+ MYSQL_SYSVAR(purge_batch_size),
NULL
};
diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h
index 31e88ed8530..8a3e1ccff82 100644
--- a/storage/innobase/handler/ha_innodb.h
+++ b/storage/innobase/handler/ha_innodb.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2000, 2009, MySQL AB & Innobase Oy. All Rights Reserved.
+Copyright (c) 2000, 2010, MySQL AB & Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -27,15 +27,31 @@ Place, Suite 330, Boston, MA 02111-1307 USA
#pragma interface /* gcc class implementation */
#endif
+/* Structure defines translation table between mysql index and innodb
+index structures */
+typedef struct innodb_idx_translate_struct {
+ ulint index_count; /*!< number of valid index entries
+ in the index_mapping array */
+ ulint array_size; /*!< array size of index_mapping */
+ dict_index_t** index_mapping; /*!< index pointer array directly
+ maps to index in Innodb from MySQL
+ array index */
+} innodb_idx_translate_t;
+
+
/** InnoDB table share */
typedef struct st_innobase_share {
- THR_LOCK lock; /*!< MySQL lock protecting
- this structure */
- const char* table_name; /*!< InnoDB table name */
- uint use_count; /*!< reference count,
- incremented in get_share()
- and decremented in free_share() */
- void* table_name_hash;/*!< hash table chain node */
+ THR_LOCK lock; /*!< MySQL lock protecting
+ this structure */
+ const char* table_name; /*!< InnoDB table name */
+ uint use_count; /*!< reference count,
+ incremented in get_share()
+ and decremented in
+ free_share() */
+ void* table_name_hash;/*!< hash table chain node */
+ innodb_idx_translate_t idx_trans_tbl; /*!< index translation
+ table between MySQL and
+ Innodb */
} INNOBASE_SHARE;
@@ -91,9 +107,8 @@ class ha_innobase: public handler
ulint innobase_reset_autoinc(ulonglong auto_inc);
ulint innobase_get_autoinc(ulonglong* value);
ulint innobase_update_autoinc(ulonglong auto_inc);
- ulint innobase_initialize_autoinc();
+ void innobase_initialize_autoinc();
dict_index_t* innobase_get_index(uint keynr);
- ulonglong innobase_get_int_col_max_value(const Field* field);
/* Init values for the class: */
public:
diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc
index 99c1d37b04c..9836fb11ebc 100644
--- a/storage/innobase/handler/handler0alter.cc
+++ b/storage/innobase/handler/handler0alter.cc
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -21,10 +21,8 @@ Place, Suite 330, Boston, MA 02111-1307 USA
Smart ALTER TABLE
*******************************************************/
-#include <unireg.h>
+#include <mysql_priv.h>
#include <mysqld_error.h>
-#include <sql_lex.h> // SQLCOM_CREATE_INDEX
-#include <mysql/innodb_priv.h>
extern "C" {
#include "log0log.h"
@@ -231,9 +229,11 @@ static
int
innobase_check_index_keys(
/*======================*/
- const KEY* key_info, /*!< in: Indexes to be created */
- ulint num_of_keys) /*!< in: Number of indexes to
- be created */
+ const KEY* key_info, /*!< in: Indexes to be
+ created */
+ ulint num_of_keys, /*!< in: Number of
+ indexes to be created */
+ const dict_table_t* table) /*!< in: Existing indexes */
{
ulint key_num;
@@ -250,9 +250,22 @@ innobase_check_index_keys(
const KEY& key2 = key_info[i];
if (0 == strcmp(key.name, key2.name)) {
- sql_print_error("InnoDB: key name `%s` appears"
- " twice in CREATE INDEX\n",
- key.name);
+ my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+ key.name);
+
+ return(ER_WRONG_NAME_FOR_INDEX);
+ }
+ }
+
+ /* Check that the same index name does not already exist. */
+
+ for (const dict_index_t* index
+ = dict_table_get_first_index(table);
+ index; index = dict_table_get_next_index(index)) {
+
+ if (0 == strcmp(key.name, index->name)) {
+ my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+ key.name);
return(ER_WRONG_NAME_FOR_INDEX);
}
@@ -260,7 +273,7 @@ innobase_check_index_keys(
/* Check that MySQL does not try to create a column
prefix index field on an inappropriate data type and
- that the same colum does not appear twice in the index. */
+ that the same column does not appear twice in the index. */
for (ulint i = 0; i < key.key_parts; i++) {
const KEY_PART_INFO& key_part1
@@ -291,14 +304,8 @@ innobase_check_index_keys(
}
}
- sql_print_error("InnoDB: MySQL is trying to"
- " create a column prefix"
- " index field on an"
- " inappropriate data type."
- " column `%s`,"
- " index `%s`.\n",
- field->field_name,
- key.name);
+ my_error(ER_WRONG_KEY_COLUMN, MYF(0),
+ field->field_name);
return(ER_WRONG_KEY_COLUMN);
}
@@ -311,11 +318,8 @@ innobase_check_index_keys(
continue;
}
- sql_print_error("InnoDB: column `%s`"
- " is not allowed to occur"
- " twice in index `%s`.\n",
- key_part1.field->field_name,
- key.name);
+ my_error(ER_WRONG_KEY_COLUMN, MYF(0),
+ key_part1.field->field_name);
return(ER_WRONG_KEY_COLUMN);
}
}
@@ -524,12 +528,14 @@ innobase_create_key_def(
key_info->name, "PRIMARY");
/* If there is a UNIQUE INDEX consisting entirely of NOT NULL
- columns, MySQL will treat it as a PRIMARY KEY unless the
- table already has one. */
+ columns and if the index does not contain column prefix(es)
+ (only prefix/part of the column is indexed), MySQL will treat the
+ index as a PRIMARY KEY unless the table already has one. */
if (!new_primary && (key_info->flags & HA_NOSAME)
+ && (!(key_info->flags & HA_KEY_HAS_PART_KEY_SEG))
&& row_table_got_default_clust_index(table)) {
- uint key_part = key_info->key_parts;
+ uint key_part = key_info->key_parts;
new_primary = TRUE;
@@ -658,12 +664,18 @@ ha_innobase::add_index(
innodb_table = indexed_table
= dict_table_get(prebuilt->table->name, FALSE);
+ if (UNIV_UNLIKELY(!innodb_table)) {
+ error = HA_ERR_NO_SUCH_TABLE;
+ goto err_exit;
+ }
+
/* Check if the index name is reserved. */
if (innobase_index_name_is_reserved(trx, key_info, num_of_keys)) {
error = -1;
} else {
/* Check that index keys are sensible */
- error = innobase_check_index_keys(key_info, num_of_keys);
+ error = innobase_check_index_keys(key_info, num_of_keys,
+ innodb_table);
}
if (UNIV_UNLIKELY(error)) {
@@ -710,6 +722,8 @@ err_exit:
row_mysql_lock_data_dictionary(trx);
dict_locked = TRUE;
+ ut_d(dict_table_check_for_dup_indexes(innodb_table, FALSE));
+
/* If a new primary key is defined for the table we need
to drop the original table and rebuild all indexes. */
@@ -742,6 +756,8 @@ err_exit:
user_thd);
}
+ ut_d(dict_table_check_for_dup_indexes(innodb_table,
+ FALSE));
row_mysql_unlock_data_dictionary(trx);
goto err_exit;
}
@@ -766,6 +782,10 @@ err_exit:
ut_ad(error == DB_SUCCESS);
+ /* We will need to rebuild index translation table. Set
+ valid index entry count in the translation table to zero */
+ share->idx_trans_tbl.index_count = 0;
+
/* Commit the data dictionary transaction in order to release
the table locks on the system tables. This means that if
MySQL crashes while creating a new primary key inside
@@ -801,18 +821,6 @@ err_exit:
index, num_of_idx, table);
error_handling:
-#ifdef UNIV_DEBUG
- /* TODO: At the moment we can't handle the following statement
- in our debugging code below:
-
- alter table t drop index b, add index (b);
-
- The fix will have to parse the SQL and note that the index
- being added has the same name as the one being dropped and
- ignore that in the dup index check.*/
- //dict_table_check_for_dup_indexes(prebuilt->table);
-#endif
-
/* After an error, remove all those index definitions from the
dictionary which were defined. */
@@ -824,6 +832,8 @@ error_handling:
row_mysql_lock_data_dictionary(trx);
dict_locked = TRUE;
+ ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE));
+
if (!new_primary) {
error = row_merge_rename_indexes(trx, indexed_table);
@@ -910,6 +920,8 @@ convert_error:
trx_commit_for_mysql(prebuilt->trx);
}
+ ut_d(dict_table_check_for_dup_indexes(innodb_table, FALSE));
+
if (dict_locked) {
row_mysql_unlock_data_dictionary(trx);
}
@@ -953,6 +965,7 @@ ha_innobase::prepare_drop_index(
/* Test and mark all the indexes to be dropped */
row_mysql_lock_data_dictionary(trx);
+ ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
/* Check that none of the indexes have previously been flagged
for deletion. */
@@ -1118,6 +1131,7 @@ func_exit:
} while (index);
}
+ ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
row_mysql_unlock_data_dictionary(trx);
DBUG_RETURN(err);
@@ -1164,6 +1178,7 @@ ha_innobase::final_drop_index(
prebuilt->table->flags, user_thd);
row_mysql_lock_data_dictionary(trx);
+ ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
if (UNIV_UNLIKELY(err)) {
@@ -1200,11 +1215,12 @@ ha_innobase::final_drop_index(
ut_a(!index->to_be_dropped);
}
-#ifdef UNIV_DEBUG
- dict_table_check_for_dup_indexes(prebuilt->table);
-#endif
+ /* We will need to rebuild index translation table. Set
+ valid index entry count in the translation table to zero */
+ share->idx_trans_tbl.index_count = 0;
func_exit:
+ ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
trx_commit_for_mysql(trx);
trx_commit_for_mysql(prebuilt->trx);
row_mysql_unlock_data_dictionary(trx);
diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc
index 7d8b4a8dd40..524fe696de2 100644
--- a/storage/innobase/handler/i_s.cc
+++ b/storage/innobase/handler/i_s.cc
@@ -23,8 +23,8 @@ InnoDB INFORMATION SCHEMA tables interface to MySQL.
Created July 18, 2007 Vasil Dimov
*******************************************************/
+#include <mysql_priv.h>
#include <mysqld_error.h>
-#include <sql_acl.h> // PROCESS_ACL
#include <m_ctype.h>
#include <hash.h>
@@ -32,8 +32,7 @@ Created July 18, 2007 Vasil Dimov
#include <mysys_err.h>
#include <my_sys.h>
#include "i_s.h"
-#include <sql_plugin.h>
-#include <mysql/innodb_priv.h>
+#include <mysql/plugin.h>
extern "C" {
#include "trx0i_s.h"
diff --git a/storage/innobase/handler/mysql_addons.cc b/storage/innobase/handler/mysql_addons.cc
index ae6306e5db9..eae1fe9fbc2 100644
--- a/storage/innobase/handler/mysql_addons.cc
+++ b/storage/innobase/handler/mysql_addons.cc
@@ -36,7 +36,7 @@ Created November 07, 2007 Vasil Dimov
#define MYSQL_SERVER
#endif /* MYSQL_SERVER */
-#include <sql_priv.h>
+#include <mysql_priv.h>
#include "mysql_addons.h"
#include "univ.i"
diff --git a/storage/innobase/ibuf/ibuf0ibuf.c b/storage/innobase/ibuf/ibuf0ibuf.c
index 08986fac0ef..d405d90fe25 100644
--- a/storage/innobase/ibuf/ibuf0ibuf.c
+++ b/storage/innobase/ibuf/ibuf0ibuf.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -89,7 +89,28 @@ is in the compact format. The presence of this marker can be detected by
looking at the length of the field modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE.
The high-order bit of the character set field in the type info is the
-"nullable" flag for the field. */
+"nullable" flag for the field.
+
+In versions >= 5.5:
+
+The optional marker byte at the start of the fourth field is replaced by
+mandatory 3 fields, totaling 4 bytes:
+
+ 1. 2 bytes: Counter field, used to sort records within a (space id, page
+ no) in the order they were added. This is needed so that for example the
+ sequence of operations "INSERT x, DEL MARK x, INSERT x" is handled
+ correctly.
+
+ 2. 1 byte: Operation type (see ibuf_op_t).
+
+ 3. 1 byte: Flags. Currently only one flag exists, IBUF_REC_COMPACT.
+
+To ensure older records, which do not have counters to enforce correct
+sorting, are merged before any new records, ibuf_insert checks if we're
+trying to insert to a position that contains old-style records, and if so,
+refuses the insert. Thus, ibuf pages are gradually converted to the new
+format as their corresponding buffer pool pages are read into memory.
+*/
/* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM
@@ -168,7 +189,7 @@ access order rules. */
#define IBUF_TABLE_NAME "SYS_IBUF_TABLE"
/** Operations that can currently be buffered. */
-UNIV_INTERN ibuf_use_t ibuf_use = IBUF_USE_INSERT;
+UNIV_INTERN ibuf_use_t ibuf_use = IBUF_USE_ALL;
/** The insert buffer control structure */
UNIV_INTERN ibuf_t* ibuf = NULL;
@@ -176,6 +197,12 @@ UNIV_INTERN ibuf_t* ibuf = NULL;
/** Counter for ibuf_should_try() */
UNIV_INTERN ulint ibuf_flush_count = 0;
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key;
+UNIV_INTERN mysql_pfs_key_t ibuf_mutex_key;
+UNIV_INTERN mysql_pfs_key_t ibuf_bitmap_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
#ifdef UNIV_IBUF_COUNT_DEBUG
/** Number of tablespaces in the ibuf_counts array */
#define IBUF_COUNT_N_SPACES 4
@@ -221,6 +248,32 @@ ibuf_count_check(
list of the ibuf */
/* @} */
+/* Various constants for checking the type of an ibuf record and extracting
+data from it. For details, see the description of the record format at the
+top of this file. */
+
+/** @name Format of the fourth column of an insert buffer record
+The fourth column in the MySQL 5.5 format contains an operation
+type, counter, and some flags. */
+/* @{ */
+#define IBUF_REC_INFO_SIZE 4 /*!< Combined size of info fields at
+ the beginning of the fourth field */
+#if IBUF_REC_INFO_SIZE >= DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+# error "IBUF_REC_INFO_SIZE >= DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE"
+#endif
+
+/* Offsets for the fields at the beginning of the fourth field */
+#define IBUF_REC_OFFSET_COUNTER 0 /*!< Operation counter */
+#define IBUF_REC_OFFSET_TYPE 2 /*!< Type of operation */
+#define IBUF_REC_OFFSET_FLAGS 3 /*!< Additional flags */
+
+/* Record flag masks */
+#define IBUF_REC_COMPACT 0x1 /*!< Set in
+ IBUF_REC_OFFSET_FLAGS if the
+ user index is in COMPACT
+ format or later */
+
+
/** The mutex used to block pessimistic inserts to ibuf trees */
static mutex_t ibuf_pessimistic_insert_mutex;
@@ -461,12 +514,15 @@ ibuf_init_at_db_start(void)
ibuf->max_size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE
/ IBUF_POOL_SIZE_PER_MAX_SIZE;
- mutex_create(&ibuf_pessimistic_insert_mutex,
+ mutex_create(ibuf_pessimistic_insert_mutex_key,
+ &ibuf_pessimistic_insert_mutex,
SYNC_IBUF_PESS_INSERT_MUTEX);
- mutex_create(&ibuf_mutex, SYNC_IBUF_MUTEX);
+ mutex_create(ibuf_mutex_key,
+ &ibuf_mutex, SYNC_IBUF_MUTEX);
- mutex_create(&ibuf_bitmap_mutex, SYNC_IBUF_BITMAP_MUTEX);
+ mutex_create(ibuf_bitmap_mutex_key,
+ &ibuf_bitmap_mutex, SYNC_IBUF_BITMAP_MUTEX);
mtr_start(&mtr);
@@ -730,24 +786,41 @@ page containing the descriptor bits for the file page; the bitmap page
is x-latched */
static
page_t*
-ibuf_bitmap_get_map_page(
-/*=====================*/
- ulint space, /*!< in: space id of the file page */
- ulint page_no,/*!< in: page number of the file page */
- ulint zip_size,/*!< in: compressed page size in bytes;
- 0 for uncompressed pages */
- mtr_t* mtr) /*!< in: mtr */
+ibuf_bitmap_get_map_page_func(
+/*==========================*/
+ ulint space, /*!< in: space id of the file page */
+ ulint page_no,/*!< in: page number of the file page */
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
{
buf_block_t* block;
- block = buf_page_get(space, zip_size,
- ibuf_bitmap_page_no_calc(zip_size, page_no),
- RW_X_LATCH, mtr);
+ block = buf_page_get_gen(space, zip_size,
+ ibuf_bitmap_page_no_calc(zip_size, page_no),
+ RW_X_LATCH, NULL, BUF_GET,
+ file, line, mtr);
buf_block_dbg_add_level(block, SYNC_IBUF_BITMAP);
return(buf_block_get_frame(block));
}
+/********************************************************************//**
+Gets the ibuf bitmap page where the bits describing a given file page are
+stored.
+@return bitmap page where the file page is mapped, that is, the bitmap
+page containing the descriptor bits for the file page; the bitmap page
+is x-latched
+@param space in: space id of the file page
+@param page_no in: page number of the file page
+@param zip_size in: compressed page size in bytes; 0 for uncompressed pages
+@param mtr in: mini-transaction */
+#define ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr) \
+ ibuf_bitmap_get_map_page_func(space, page_no, zip_size, \
+ __FILE__, __LINE__, mtr)
+
/************************************************************************//**
Sets the free bits of the page in the ibuf bitmap. This is done in a separate
mini-transaction, hence this operation does not restrict further work to only
@@ -1133,9 +1206,185 @@ ibuf_rec_get_space(
return(0);
}
+/****************************************************************//**
+Get various information about an ibuf record in >= 4.1.x format. */
+static
+void
+ibuf_rec_get_info(
+/*==============*/
+ const rec_t* rec, /*!< in: ibuf record */
+ ibuf_op_t* op, /*!< out: operation type, or NULL */
+ ibool* comp, /*!< out: compact flag, or NULL */
+ ulint* info_len, /*!< out: length of info fields at the
+ start of the fourth field, or
+ NULL */
+ ulint* counter) /*!< in: counter value, or NULL */
+{
+ const byte* types;
+ ulint fields;
+ ulint len;
+
+ /* Local variables to shadow arguments. */
+ ibuf_op_t op_local;
+ ibool comp_local;
+ ulint info_len_local;
+ ulint counter_local;
+
+ ut_ad(ibuf_inside());
+ fields = rec_get_n_fields_old(rec);
+ ut_a(fields > 4);
+
+ types = rec_get_nth_field_old(rec, 3, &len);
+
+ info_len_local = len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
+
+ switch (info_len_local) {
+ case 0:
+ case 1:
+ op_local = IBUF_OP_INSERT;
+ comp_local = info_len_local;
+ ut_ad(!counter);
+ counter_local = ULINT_UNDEFINED;
+ break;
+
+ case IBUF_REC_INFO_SIZE:
+ op_local = (ibuf_op_t)types[IBUF_REC_OFFSET_TYPE];
+ comp_local = types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT;
+ counter_local = mach_read_from_2(
+ types + IBUF_REC_OFFSET_COUNTER);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ ut_a(op_local < IBUF_OP_COUNT);
+ ut_a((len - info_len_local) ==
+ (fields - 4) * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ if (op) {
+ *op = op_local;
+ }
+
+ if (comp) {
+ *comp = comp_local;
+ }
+
+ if (info_len) {
+ *info_len = info_len_local;
+ }
+
+ if (counter) {
+ *counter = counter_local;
+ }
+}
+
+/****************************************************************//**
+Returns the operation type field of an ibuf record.
+@return operation type */
+static
+ibuf_op_t
+ibuf_rec_get_op_type(
+/*=================*/
+ const rec_t* rec) /*!< in: ibuf record */
+{
+ ulint len;
+ const byte* field;
+
+ ut_ad(ibuf_inside());
+ ut_ad(rec_get_n_fields_old(rec) > 2);
+
+ field = rec_get_nth_field_old(rec, 1, &len);
+
+ if (len > 1) {
+ /* This is a < 4.1.x format record */
+
+ return(IBUF_OP_INSERT);
+ } else {
+ ibuf_op_t op;
+
+ ibuf_rec_get_info(rec, &op, NULL, NULL, NULL);
+
+ return(op);
+ }
+}
+
+/****************************************************************//**
+Read the first two bytes from a record's fourth field (counter field in new
+records; something else in older records).
+@return "counter" field, or ULINT_UNDEFINED if for some reason it
+can't be read */
+UNIV_INTERN
+ulint
+ibuf_rec_get_counter(
+/*=================*/
+ const rec_t* rec) /*!< in: ibuf record */
+{
+ const byte* ptr;
+ ulint len;
+
+ if (rec_get_n_fields_old(rec) < 4) {
+
+ return(ULINT_UNDEFINED);
+ }
+
+ ptr = rec_get_nth_field_old(rec, 3, &len);
+
+ if (len >= 2) {
+
+ return(mach_read_from_2(ptr));
+ } else {
+
+ return(ULINT_UNDEFINED);
+ }
+}
+
+/****************************************************************//**
+Add accumulated operation counts to a permanent array. Both arrays must be
+of size IBUF_OP_COUNT. */
+static
+void
+ibuf_add_ops(
+/*=========*/
+ ulint* arr, /*!< in/out: array to modify */
+ const ulint* ops) /*!< in: operation counts */
+
+{
+ ulint i;
+
+ for (i = 0; i < IBUF_OP_COUNT; i++) {
+ arr[i] += ops[i];
+ }
+}
+
+/****************************************************************//**
+Print operation counts. The array must be of size IBUF_OP_COUNT. */
+static
+void
+ibuf_print_ops(
+/*===========*/
+ const ulint* ops, /*!< in: operation counts */
+ FILE* file) /*!< in: file where to print */
+{
+ static const char* op_names[] = {
+ "insert",
+ "delete mark",
+ "delete"
+ };
+ ulint i;
+
+ ut_a(UT_ARR_SIZE(op_names) == IBUF_OP_COUNT);
+
+ for (i = 0; i < IBUF_OP_COUNT; i++) {
+ fprintf(file, "%s %lu%s", op_names[i],
+ (ulong) ops[i], (i < (IBUF_OP_COUNT - 1)) ? ", " : "");
+ }
+
+ putc('\n', file);
+}
+
/********************************************************************//**
Creates a dummy index for inserting a record to a non-clustered index.
-
@return dummy index */
static
dict_index_t*
@@ -1246,8 +1495,16 @@ ibuf_build_entry_pre_4_1_x(
}
/*********************************************************************//**
-Builds the entry to insert into a non-clustered index when we have the
-corresponding record in an ibuf index.
+Builds the entry used to
+
+1) IBUF_OP_INSERT: insert into a non-clustered index
+
+2) IBUF_OP_DELETE_MARK: find the record whose delete-mark flag we need to
+ activate
+
+3) IBUF_OP_DELETE: find the record we need to delete
+
+when we have the corresponding record in an ibuf index.
NOTE that as we copy pointers to fields in ibuf_rec, the caller must
hold a latch to the ibuf_rec page as long as the entry is used!
@@ -1268,7 +1525,9 @@ ibuf_build_entry_from_ibuf_rec(
const byte* types;
const byte* data;
ulint len;
+ ulint info_len;
ulint i;
+ ulint comp;
dict_index_t* index;
data = rec_get_nth_field_old(ibuf_rec, 1, &len);
@@ -1291,16 +1550,12 @@ ibuf_build_entry_from_ibuf_rec(
types = rec_get_nth_field_old(ibuf_rec, 3, &len);
- ut_a(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE <= 1);
- index = ibuf_dummy_index_create(
- n_fields, len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+ ibuf_rec_get_info(ibuf_rec, NULL, &comp, &info_len, NULL);
- if (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) {
- /* compact record format */
- len--;
- ut_a(*types == 0);
- types++;
- }
+ index = ibuf_dummy_index_create(n_fields, comp);
+
+ len -= info_len;
+ types += info_len;
ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
@@ -1330,6 +1585,58 @@ ibuf_build_entry_from_ibuf_rec(
return(tuple);
}
+/******************************************************************//**
+Get the data size.
+@return size of fields */
+UNIV_INLINE
+ulint
+ibuf_rec_get_size(
+/*==============*/
+ const rec_t* rec, /*!< in: ibuf record */
+ const byte* types, /*!< in: fields */
+ ulint n_fields, /*!< in: number of fields */
+ ibool pre_4_1, /*!< in: TRUE=pre-4.1 format,
+ FALSE=newer */
+ ulint comp) /*!< in: 0=ROW_FORMAT=REDUNDANT,
+ nonzero=ROW_FORMAT=COMPACT */
+{
+ ulint i;
+ ulint field_offset;
+ ulint types_offset;
+ ulint size = 0;
+
+ if (pre_4_1) {
+ field_offset = 2;
+ types_offset = DATA_ORDER_NULL_TYPE_BUF_SIZE;
+ } else {
+ field_offset = 4;
+ types_offset = DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ ulint len;
+ dtype_t dtype;
+
+ rec_get_nth_field_offs_old(rec, i + field_offset, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ size += len;
+ } else if (pre_4_1) {
+ dtype_read_for_order_and_null_size(&dtype, types);
+
+ size += dtype_get_sql_null_size(&dtype, comp);
+ } else {
+ dtype_new_read_for_order_and_null_size(&dtype, types);
+
+ size += dtype_get_sql_null_size(&dtype, comp);
+ }
+
+ types += types_offset;
+ }
+
+ return(size);
+}
+
/********************************************************************//**
Returns the space taken by a stored non-clustered index entry if converted to
an index record.
@@ -1341,22 +1648,21 @@ ibuf_rec_get_volume(
/*================*/
const rec_t* ibuf_rec)/*!< in: ibuf record */
{
- dtype_t dtype;
- ibool new_format = FALSE;
- ulint data_size = 0;
- ulint n_fields;
- const byte* types;
- const byte* data;
ulint len;
- ulint i;
+ const byte* data;
+ const byte* types;
+ ulint n_fields;
+ ulint data_size;
+ ibool pre_4_1;
ulint comp;
ut_ad(ibuf_inside());
ut_ad(rec_get_n_fields_old(ibuf_rec) > 2);
data = rec_get_nth_field_old(ibuf_rec, 1, &len);
+ pre_4_1 = (len > 1);
- if (len > 1) {
+ if (pre_4_1) {
/* < 4.1.x format record */
ut_a(trx_doublewrite_must_reset_space_ids);
@@ -1370,54 +1676,46 @@ ibuf_rec_get_volume(
comp = 0;
} else {
/* >= 4.1.x format record */
+ ibuf_op_t op;
+ ulint info_len;
ut_a(trx_sys_multiple_tablespace_format);
ut_a(*data == 0);
types = rec_get_nth_field_old(ibuf_rec, 3, &len);
- comp = len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
+ ibuf_rec_get_info(ibuf_rec, &op, &comp, &info_len, NULL);
+
+ if (op == IBUF_OP_DELETE_MARK || op == IBUF_OP_DELETE) {
+ /* Delete-marking a record doesn't take any
+ additional space, and while deleting a record
+ actually frees up space, we have to play it safe and
+ pretend it takes no additional space (the record
+ might not exist, etc.). */
- ut_a(comp <= 1);
- if (comp) {
- /* compact record format */
+ return(0);
+ } else if (comp) {
+ dtuple_t* entry;
ulint volume;
dict_index_t* dummy_index;
mem_heap_t* heap = mem_heap_create(500);
- dtuple_t* entry = ibuf_build_entry_from_ibuf_rec(
+
+ entry = ibuf_build_entry_from_ibuf_rec(
ibuf_rec, heap, &dummy_index);
+
volume = rec_get_converted_size(dummy_index, entry, 0);
+
ibuf_dummy_index_free(dummy_index);
mem_heap_free(heap);
+
return(volume + page_dir_calc_reserved_space(1));
}
+ types += info_len;
n_fields = rec_get_n_fields_old(ibuf_rec) - 4;
-
- new_format = TRUE;
}
- for (i = 0; i < n_fields; i++) {
- if (new_format) {
- data = rec_get_nth_field_old(ibuf_rec, i + 4, &len);
-
- dtype_new_read_for_order_and_null_size(
- &dtype, types + i
- * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
- } else {
- data = rec_get_nth_field_old(ibuf_rec, i + 2, &len);
-
- dtype_read_for_order_and_null_size(
- &dtype, types + i
- * DATA_ORDER_NULL_TYPE_BUF_SIZE);
- }
-
- if (len == UNIV_SQL_NULL) {
- data_size += dtype_get_sql_null_size(&dtype, comp);
- } else {
- data_size += len;
- }
- }
+ data_size = ibuf_rec_get_size(ibuf_rec, types, n_fields, pre_4_1, comp);
return(data_size + rec_get_converted_extra_size(data_size, n_fields, 0)
+ page_dir_calc_reserved_space(1));
@@ -1435,11 +1733,14 @@ static
dtuple_t*
ibuf_entry_build(
/*=============*/
+ ibuf_op_t op, /*!< in: operation type */
dict_index_t* index, /*!< in: non-clustered index */
const dtuple_t* entry, /*!< in: entry for a non-clustered index */
ulint space, /*!< in: space id */
ulint page_no,/*!< in: index page number where entry should
be inserted */
+ ulint counter,/*!< in: counter value;
+ ULINT_UNDEFINED=not used */
mem_heap_t* heap) /*!< in: heap into which to build */
{
dtuple_t* tuple;
@@ -1447,28 +1748,28 @@ ibuf_entry_build(
const dfield_t* entry_field;
ulint n_fields;
byte* buf;
- byte* buf2;
+ byte* ti;
+ byte* type_info;
ulint i;
- /* Starting from 4.1.x, we have to build a tuple whose
- (1) first field is the space id,
- (2) the second field a single marker byte (0) to tell that this
- is a new format record,
- (3) the third contains the page number, and
- (4) the fourth contains the relevent type information of each data
- field; the length of this field % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE is
- (a) 0 for b-trees in the old format, and
- (b) 1 for b-trees in the compact format, the first byte of the field
- being the marker (0);
- (5) and the rest of the fields are copied from entry. All fields
- in the tuple are ordered like the type binary in our insert buffer
- tree. */
+ ut_ad(counter != ULINT_UNDEFINED || op == IBUF_OP_INSERT);
+ ut_ad(counter == ULINT_UNDEFINED || counter <= 0xFFFF);
+ ut_ad(op < IBUF_OP_COUNT);
+
+ /* We have to build a tuple with the following fields:
+
+ 1-4) These are described at the top of this file.
+
+ 5) The rest of the fields are copied from the entry.
+
+ All fields in the tuple are ordered like the type binary in our
+ insert buffer tree. */
n_fields = dtuple_get_n_fields(entry);
tuple = dtuple_create(heap, n_fields + 4);
- /* Store the space id in tuple */
+ /* 1) Space Id */
field = dtuple_get_nth_field(tuple, 0);
@@ -1478,7 +1779,7 @@ ibuf_entry_build(
dfield_set_data(field, buf, 4);
- /* Store the marker byte field in tuple */
+ /* 2) Marker byte */
field = dtuple_get_nth_field(tuple, 1);
@@ -1490,7 +1791,7 @@ ibuf_entry_build(
dfield_set_data(field, buf, 1);
- /* Store the page number in tuple */
+ /* 3) Page number */
field = dtuple_get_nth_field(tuple, 2);
@@ -1500,14 +1801,42 @@ ibuf_entry_build(
dfield_set_data(field, buf, 4);
- /* Store the type info in buf2, and add the fields from entry to
- tuple */
- buf2 = mem_heap_alloc(heap, n_fields
- * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
- + dict_table_is_comp(index->table));
- if (dict_table_is_comp(index->table)) {
- *buf2++ = 0; /* write the compact format indicator */
+ /* 4) Type info, part #1 */
+
+ if (counter == ULINT_UNDEFINED) {
+ i = dict_table_is_comp(index->table) ? 1 : 0;
+ } else {
+ ut_ad(counter <= 0xFFFF);
+ i = IBUF_REC_INFO_SIZE;
+ }
+
+ ti = type_info = mem_heap_alloc(heap, i + n_fields
+ * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ switch (i) {
+ default:
+ ut_error;
+ break;
+ case 1:
+ /* set the flag for ROW_FORMAT=COMPACT */
+ *ti++ = 0;
+ /* fall through */
+ case 0:
+ /* the old format does not allow delete buffering */
+ ut_ad(op == IBUF_OP_INSERT);
+ break;
+ case IBUF_REC_INFO_SIZE:
+ mach_write_to_2(ti + IBUF_REC_OFFSET_COUNTER, counter);
+
+ ti[IBUF_REC_OFFSET_TYPE] = (byte) op;
+ ti[IBUF_REC_OFFSET_FLAGS] = dict_table_is_comp(index->table)
+ ? IBUF_REC_COMPACT : 0;
+ ti += IBUF_REC_INFO_SIZE;
+ break;
}
+
+ /* 5+) Fields from the entry */
+
for (i = 0; i < n_fields; i++) {
ulint fixed_len;
const dict_field_t* ifield;
@@ -1542,21 +1871,16 @@ ibuf_entry_build(
#endif /* UNIV_DEBUG */
dtype_new_store_for_order_and_null_size(
- buf2 + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE,
- dfield_get_type(entry_field), fixed_len);
+ ti, dfield_get_type(entry_field), fixed_len);
+ ti += DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
}
- /* Store the type info in buf2 to field 3 of tuple */
+ /* 4) Type info, part #2 */
field = dtuple_get_nth_field(tuple, 3);
- if (dict_table_is_comp(index->table)) {
- buf2--;
- }
+ dfield_set_data(field, type_info, ti - type_info);
- dfield_set_data(field, buf2, n_fields
- * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
- + dict_table_is_comp(index->table));
/* Set all the types in the new tuple binary */
dtuple_set_types_binary(tuple, n_fields + 4);
@@ -2165,6 +2489,21 @@ ibuf_contract_ext(
ibuf_is_empty:
mutex_exit(&ibuf_mutex);
+#if 0 /* TODO */
+ if (srv_shutdown_state) {
+ /* If the insert buffer becomes empty during
+ shutdown, note it in the system tablespace. */
+
+ trx_sys_set_ibuf_format(TRX_SYS_IBUF_EMPTY);
+ }
+
+ /* TO DO: call trx_sys_set_ibuf_format() at startup
+ and whenever ibuf_use is changed to allow buffered
+ delete-marking or deleting. Never downgrade the
+ stamped format except when the insert buffer becomes
+ empty. */
+#endif
+
return(0);
}
@@ -2177,6 +2516,8 @@ ibuf_is_empty:
btr_pcur_open_at_rnd_pos(ibuf->index, BTR_SEARCH_LEAF, &pcur, &mtr);
+ ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
+
if (page_get_n_recs(btr_pcur_get_page(&pcur)) == 0) {
/* When the ibuf tree is emptied completely, the last record
is removed using an optimistic delete and ibuf_size_update
@@ -2310,6 +2651,160 @@ ibuf_contract_after_insert(
}
/*********************************************************************//**
+Determine if an insert buffer record has been encountered already.
+@return TRUE if a new record, FALSE if possible duplicate */
+static
+ibool
+ibuf_get_volume_buffered_hash(
+/*==========================*/
+ const rec_t* rec, /*!< in: ibuf record in post-4.1 format */
+ const byte* types, /*!< in: fields */
+ const byte* data, /*!< in: start of user record data */
+ ulint comp, /*!< in: 0=ROW_FORMAT=REDUNDANT,
+ nonzero=ROW_FORMAT=COMPACT */
+ ulint* hash, /*!< in/out: hash array */
+ ulint size) /*!< in: number of elements in hash array */
+{
+ ulint len;
+ ulint fold;
+ ulint bitmask;
+
+ len = ibuf_rec_get_size(rec, types, rec_get_n_fields_old(rec) - 4,
+ FALSE, comp);
+ fold = ut_fold_binary(data, len);
+
+ hash += (fold / (CHAR_BIT * sizeof *hash)) % size;
+ bitmask = 1 << (fold % (CHAR_BIT * sizeof *hash));
+
+ if (*hash & bitmask) {
+
+ return(FALSE);
+ }
+
+ /* We have not seen this record yet. Insert it. */
+ *hash |= bitmask;
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Update the estimate of the number of records on a page, and
+get the space taken by merging the buffered record to the index page.
+@return size of index record in bytes + an upper limit of the space
+taken in the page directory */
+static
+ulint
+ibuf_get_volume_buffered_count(
+/*===========================*/
+ const rec_t* rec, /*!< in: insert buffer record */
+ ulint* hash, /*!< in/out: hash array */
+ ulint size, /*!< in: number of elements in hash array */
+ lint* n_recs) /*!< in/out: estimated number of records
+ on the page that rec points to */
+{
+ ulint len;
+ ibuf_op_t ibuf_op;
+ const byte* types;
+ ulint n_fields = rec_get_n_fields_old(rec);
+
+ ut_ad(ibuf_inside());
+ ut_ad(n_fields > 4);
+ n_fields -= 4;
+
+ rec_get_nth_field_offs_old(rec, 1, &len);
+ /* This function is only invoked when buffering new
+ operations. All pre-4.1 records should have been merged
+ when the database was started up. */
+ ut_a(len == 1);
+ ut_ad(trx_sys_multiple_tablespace_format);
+
+ types = rec_get_nth_field_old(rec, 3, &len);
+
+ switch (UNIV_EXPECT(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE,
+ IBUF_REC_INFO_SIZE)) {
+ default:
+ ut_error;
+ case 0:
+ /* This ROW_TYPE=REDUNDANT record does not include an
+ operation counter. Exclude it from the *n_recs,
+ because deletes cannot be buffered if there are
+ old-style inserts buffered for the page. */
+
+ len = ibuf_rec_get_size(rec, types, n_fields, FALSE, 0);
+
+ return(len
+ + rec_get_converted_extra_size(len, n_fields, 0)
+ + page_dir_calc_reserved_space(1));
+ case 1:
+ /* This ROW_TYPE=COMPACT record does not include an
+ operation counter. Exclude it from the *n_recs,
+ because deletes cannot be buffered if there are
+ old-style inserts buffered for the page. */
+ goto get_volume_comp;
+
+ case IBUF_REC_INFO_SIZE:
+ ibuf_op = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE];
+ break;
+ }
+
+ switch (ibuf_op) {
+ case IBUF_OP_INSERT:
+ /* Inserts can be done by
+ btr_cur_set_deleted_flag_for_ibuf(). Because
+ delete-mark and insert operations can be pointing to
+ the same records, we must not count duplicates. */
+ case IBUF_OP_DELETE_MARK:
+ /* There must be a record to delete-mark.
+ See if this record has been already buffered. */
+ if (n_recs && ibuf_get_volume_buffered_hash(
+ rec, types + IBUF_REC_INFO_SIZE,
+ types + len,
+ types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT,
+ hash, size)) {
+ (*n_recs)++;
+ }
+
+ if (ibuf_op == IBUF_OP_DELETE_MARK) {
+ /* Setting the delete-mark flag does not
+ affect the available space on the page. */
+ return(0);
+ }
+ break;
+ case IBUF_OP_DELETE:
+ /* A record will be removed from the page. */
+ if (n_recs) {
+ (*n_recs)--;
+ }
+ /* While deleting a record actually frees up space,
+ we have to play it safe and pretend that it takes no
+ additional space (the record might not exist, etc.). */
+ return(0);
+ default:
+ ut_error;
+ }
+
+ ut_ad(ibuf_op == IBUF_OP_INSERT);
+
+get_volume_comp:
+ {
+ dtuple_t* entry;
+ ulint volume;
+ dict_index_t* dummy_index;
+ mem_heap_t* heap = mem_heap_create(500);
+
+ entry = ibuf_build_entry_from_ibuf_rec(
+ rec, heap, &dummy_index);
+
+ volume = rec_get_converted_size(dummy_index, entry, 0);
+
+ ibuf_dummy_index_free(dummy_index);
+ mem_heap_free(heap);
+
+ return(volume + page_dir_calc_reserved_space(1));
+ }
+}
+
+/*********************************************************************//**
Gets an upper limit for the combined size of entries buffered in the insert
buffer for a given page.
@return upper limit for the volume of buffered inserts for the index
@@ -2326,6 +2821,9 @@ ibuf_get_volume_buffered(
or BTR_MODIFY_TREE */
ulint space, /*!< in: space id */
ulint page_no,/*!< in: page number of an index page */
+ lint* n_recs, /*!< in/out: minimum number of records on the
+ page after the buffered changes have been
+ applied, or NULL to disable the counting */
mtr_t* mtr) /*!< in: mtr */
{
ulint volume;
@@ -2335,19 +2833,25 @@ ibuf_get_volume_buffered(
page_t* prev_page;
ulint next_page_no;
page_t* next_page;
+ ulint hash_bitmap[128 / sizeof(ulint)]; /* bitmap of buffered recs */
ut_a(trx_sys_multiple_tablespace_format);
ut_ad((pcur->latch_mode == BTR_MODIFY_PREV)
|| (pcur->latch_mode == BTR_MODIFY_TREE));
- /* Count the volume of records earlier in the alphabetical order than
+ /* Count the volume of inserts earlier in the alphabetical order than
pcur */
volume = 0;
+ if (n_recs) {
+ memset(hash_bitmap, 0, sizeof hash_bitmap);
+ }
+
rec = btr_pcur_get_rec(pcur);
page = page_align(rec);
+ ut_ad(page_validate(page, ibuf->index));
if (page_rec_is_supremum(rec)) {
rec = page_rec_get_prev(rec);
@@ -2365,9 +2869,11 @@ ibuf_get_volume_buffered(
goto count_later;
}
- volume += ibuf_rec_get_volume(rec);
+ volume += ibuf_get_volume_buffered_count(
+ rec, hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
rec = page_rec_get_prev(rec);
+ ut_ad(page_align(rec) == page);
}
/* Look at the previous page */
@@ -2389,6 +2895,7 @@ ibuf_get_volume_buffered(
prev_page = buf_block_get_frame(block);
+ ut_ad(page_validate(prev_page, ibuf->index));
}
#ifdef UNIV_BTR_DEBUG
@@ -2415,9 +2922,11 @@ ibuf_get_volume_buffered(
goto count_later;
}
- volume += ibuf_rec_get_volume(rec);
+ volume += ibuf_get_volume_buffered_count(
+ rec, hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
rec = page_rec_get_prev(rec);
+ ut_ad(page_align(rec) == prev_page);
}
count_later:
@@ -2439,7 +2948,8 @@ count_later:
return(volume);
}
- volume += ibuf_rec_get_volume(rec);
+ volume += ibuf_get_volume_buffered_count(
+ rec, hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
rec = page_rec_get_next(rec);
}
@@ -2463,6 +2973,7 @@ count_later:
next_page = buf_block_get_frame(block);
+ ut_ad(page_validate(next_page, ibuf->index));
}
#ifdef UNIV_BTR_DEBUG
@@ -2486,9 +2997,11 @@ count_later:
return(volume);
}
- volume += ibuf_rec_get_volume(rec);
+ volume += ibuf_get_volume_buffered_count(
+ rec, hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
rec = page_rec_get_next(rec);
+ ut_ad(page_align(rec) == next_page);
}
}
@@ -2516,6 +3029,8 @@ ibuf_update_max_tablespace_id(void)
btr_pcur_open_at_index_side(
FALSE, ibuf->index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+ ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
+
btr_pcur_move_to_prev(&pcur, &mtr);
if (btr_pcur_is_before_first_on_page(&pcur)) {
@@ -2540,15 +3055,226 @@ ibuf_update_max_tablespace_id(void)
fil_set_max_space_id_if_bigger(max_space_id);
}
+/****************************************************************//**
+Helper function for ibuf_set_entry_counter. Checks if rec is for (space,
+page_no), and if so, reads counter value from it and returns that + 1.
+Otherwise, returns 0.
+@return new counter value, or 0 */
+static
+ulint
+ibuf_get_entry_counter_low(
+/*=======================*/
+ const rec_t* rec, /*!< in: insert buffer record */
+ ulint space, /*!< in: space id */
+ ulint page_no) /*!< in: page number */
+{
+ ulint counter;
+ const byte* field;
+ ulint len;
+
+ ut_ad(ibuf_inside());
+ ut_ad(rec_get_n_fields_old(rec) > 2);
+
+ field = rec_get_nth_field_old(rec, 1, &len);
+
+ if (UNIV_UNLIKELY(len != 1)) {
+ /* pre-4.1 format */
+ ut_a(trx_doublewrite_must_reset_space_ids);
+ ut_a(!trx_sys_multiple_tablespace_format);
+
+ return(ULINT_UNDEFINED);
+ }
+
+ ut_a(trx_sys_multiple_tablespace_format);
+
+ /* Check the tablespace identifier. */
+ field = rec_get_nth_field_old(rec, 0, &len);
+ ut_a(len == 4);
+
+ if (mach_read_from_4(field) != space) {
+
+ return(0);
+ }
+
+ /* Check the page offset. */
+ field = rec_get_nth_field_old(rec, 2, &len);
+ ut_a(len == 4);
+
+ if (mach_read_from_4(field) != page_no) {
+
+ return(0);
+ }
+
+ /* Check if the record contains a counter field. */
+ field = rec_get_nth_field_old(rec, 3, &len);
+
+ switch (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) {
+ default:
+ ut_error;
+ case 0: /* ROW_FORMAT=REDUNDANT */
+ case 1: /* ROW_FORMAT=COMPACT */
+ return(ULINT_UNDEFINED);
+
+ case IBUF_REC_INFO_SIZE:
+ counter = mach_read_from_2(field + IBUF_REC_OFFSET_COUNTER);
+ ut_a(counter < 0xFFFF);
+ return(counter + 1);
+ }
+}
+
+/****************************************************************//**
+Set the counter field in entry to the correct value based on the current
+last record in ibuf for (space, page_no).
+@return FALSE if we should abort this insertion to ibuf */
+static
+ibool
+ibuf_set_entry_counter(
+/*===================*/
+ dtuple_t* entry, /*!< in/out: entry to patch */
+ ulint space, /*!< in: space id of entry */
+ ulint page_no, /*!< in: page number of entry */
+ btr_pcur_t* pcur, /*!< in: pcur positioned on the record
+ found by btr_pcur_open(.., entry,
+ PAGE_CUR_LE, ..., pcur, ...) */
+ ibool is_optimistic, /*!< in: is this an optimistic insert */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint counter;
+ dfield_t* field;
+ byte* data;
+
+ /* pcur points to either a user rec or to a page's infimum record. */
+ ut_ad(page_validate(btr_pcur_get_page(pcur), ibuf->index));
+
+ if (btr_pcur_is_on_user_rec(pcur)) {
+
+ counter = ibuf_get_entry_counter_low(
+ btr_pcur_get_rec(pcur), space, page_no);
+
+ if (UNIV_UNLIKELY(counter == ULINT_UNDEFINED)) {
+ /* The record lacks a counter field.
+ Such old records must be merged before
+ new records can be buffered. */
+
+ return(FALSE);
+ }
+ } else if (btr_pcur_is_before_first_in_tree(pcur, mtr)) {
+ /* Ibuf tree is either completely empty, or the insert
+ position is at the very first record of a non-empty tree. In
+ either case we have no previous records for (space,
+ page_no). */
+
+ counter = 0;
+ } else if (btr_pcur_is_before_first_on_page(pcur)) {
+ btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+
+ if (cursor->low_match < 3) {
+ /* If low_match < 3, we know that the father node
+ pointer did not contain the searched for (space,
+ page_no), which means that the search ended on the
+ right page regardless of the counter value, and
+ since we're at the infimum record, there are no
+ existing records. */
+
+ counter = 0;
+ } else {
+ rec_t* rec;
+ const page_t* page;
+ buf_block_t* block;
+ page_t* prev_page;
+ ulint prev_page_no;
+
+ ut_a(cursor->ibuf_cnt != ULINT_UNDEFINED);
+
+ page = btr_pcur_get_page(pcur);
+ prev_page_no = btr_page_get_prev(page, mtr);
+
+ ut_a(prev_page_no != FIL_NULL);
+
+ block = buf_page_get(
+ IBUF_SPACE_ID, 0, prev_page_no,
+ RW_X_LATCH, mtr);
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+
+ prev_page = buf_block_get_frame(block);
+
+ rec = page_rec_get_prev(
+ page_get_supremum_rec(prev_page));
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ counter = ibuf_get_entry_counter_low(
+ rec, space, page_no);
+
+ if (UNIV_UNLIKELY(counter == ULINT_UNDEFINED)) {
+ /* The record lacks a counter field.
+ Such old records must be merged before
+ new records can be buffered. */
+
+ return(FALSE);
+ }
+
+ if (counter < cursor->ibuf_cnt) {
+ /* Search ended on the wrong page. */
+
+ if (is_optimistic) {
+ /* In an optimistic insert, we can
+ shift the insert position to the left
+ page, since it only needs an X-latch
+ on the page itself, which the
+ original search acquired for us. */
+
+ btr_cur_position(
+ ibuf->index, rec, block,
+ btr_pcur_get_btr_cur(pcur));
+ } else {
+ /* We can't shift the insert
+ position to the left page in a
+ pessimistic insert since it would
+ require an X-latch on the left
+ page's left page, so we have to
+ abort. */
+
+ return(FALSE);
+ }
+ } else {
+ /* The counter field in the father node is
+ the same as we would insert; we don't know
+ whether the insert should go to this page or
+ the left page (the later fields can differ),
+ so refuse the insert. */
+
+ return(FALSE);
+ }
+ }
+ } else {
+ /* The cursor is not positioned at or before a user record. */
+ return(FALSE);
+ }
+
+ /* Patch counter value in already built entry. */
+ field = dtuple_get_nth_field(entry, 3);
+ data = dfield_get_data(field);
+
+ mach_write_to_2(data + IBUF_REC_OFFSET_COUNTER, counter);
+
+ return(TRUE);
+}
+
/*********************************************************************//**
-Makes an index insert to the insert buffer, instead of directly to the disk
-page, if this is possible.
+Buffer an operation in the insert/delete buffer, instead of doing it
+directly to the disk page, if this is possible.
@return DB_SUCCESS, DB_FAIL, DB_STRONG_FAIL */
static
ulint
ibuf_insert_low(
/*============*/
ulint mode, /*!< in: BTR_MODIFY_PREV or BTR_MODIFY_TREE */
+ ibuf_op_t op, /*!< in: operation type */
+ ibool no_counter,
+ /*!< in: TRUE=use 5.0.3 format;
+ FALSE=allow delete buffering */
const dtuple_t* entry, /*!< in: index entry to insert */
ulint entry_size,
/*!< in: rec_get_converted_size(index, entry) */
@@ -2565,6 +3291,7 @@ ibuf_insert_low(
dtuple_t* ibuf_entry;
mem_heap_t* heap;
ulint buffered;
+ lint min_n_recs;
rec_t* ins_rec;
ibool old_bit_value;
page_t* bitmap_page;
@@ -2575,13 +3302,14 @@ ibuf_insert_low(
ib_int64_t space_versions[IBUF_MAX_N_PAGES_MERGED];
ulint page_nos[IBUF_MAX_N_PAGES_MERGED];
ulint n_stored;
- ulint bits;
mtr_t mtr;
mtr_t bitmap_mtr;
ut_a(!dict_index_is_clust(index));
ut_ad(dtuple_check_typed(entry));
ut_ad(ut_is_2pow(zip_size));
+ ut_ad(!no_counter || op == IBUF_OP_INSERT);
+ ut_a(op < IBUF_OP_COUNT);
ut_a(trx_sys_multiple_tablespace_format);
@@ -2640,11 +3368,17 @@ ibuf_insert_low(
heap = mem_heap_create(512);
- /* Build the entry which contains the space id and the page number as
- the first fields and the type information for other fields, and which
- will be inserted to the insert buffer. */
+ /* Build the entry which contains the space id and the page number
+ as the first fields and the type information for other fields, and
+ which will be inserted to the insert buffer. Using a counter value
+ of 0xFFFF we find the last record for (space, page_no), from which
+ we can then read the counter value N and use N + 1 in the record we
+ insert. (We patch the ibuf_entry's counter field to the correct
+ value just before actually inserting the entry.) */
- ibuf_entry = ibuf_entry_build(index, entry, space, page_no, heap);
+ ibuf_entry = ibuf_entry_build(
+ op, index, entry, space, page_no,
+ no_counter ? ULINT_UNDEFINED : 0xFFFF, heap);
/* Open a cursor to the insert buffer tree to calculate if we can add
the new entry to it without exceeding the free space limit for the
@@ -2653,10 +3387,49 @@ ibuf_insert_low(
mtr_start(&mtr);
btr_pcur_open(ibuf->index, ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr);
+ ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
/* Find out the volume of already buffered inserts for the same index
page */
- buffered = ibuf_get_volume_buffered(&pcur, space, page_no, &mtr);
+ min_n_recs = 0;
+ buffered = ibuf_get_volume_buffered(&pcur, space, page_no,
+ op == IBUF_OP_DELETE
+ ? &min_n_recs
+ : NULL, &mtr);
+
+ if (op == IBUF_OP_DELETE
+ && (min_n_recs < 2
+ || buf_pool_watch_occurred(space, page_no))) {
+ /* The page could become empty after the record is
+ deleted, or the page has been read in to the buffer
+ pool. Refuse to buffer the operation. */
+
+ /* The buffer pool watch is needed for IBUF_OP_DELETE
+ because of latching order considerations. We can
+ check buf_pool_watch_occurred() only after latching
+ the insert buffer B-tree pages that contain buffered
+ changes for the page. We never buffer IBUF_OP_DELETE,
+ unless some IBUF_OP_INSERT or IBUF_OP_DELETE_MARK have
+ been previously buffered for the page. Because there
+ are buffered operations for the page, the insert
+ buffer B-tree page latches held by mtr will guarantee
+ that no changes for the user page will be merged
+ before mtr_commit(&mtr). We must not mtr_commit(&mtr)
+ until after the IBUF_OP_DELETE has been buffered. */
+
+ err = DB_STRONG_FAIL;
+
+ goto function_exit;
+ }
+
+ /* After this point, the page could still be loaded to the
+ buffer pool, but we do not have to care about it, since we are
+ holding a latch on the insert buffer leaf page that contains
+ buffered changes for (space, page_no). If the page enters the
+ buffer pool, buf_page_io_complete() for (space, page_no) will
+ have to acquire a latch on the same insert buffer leaf page,
+ which it cannot do until we have buffered the IBUF_OP_DELETE
+ and done mtr_commit(&mtr) to release the latch. */
#ifdef UNIV_IBUF_COUNT_DEBUG
ut_a((buffered == 0) || ibuf_count_get(space, page_no));
@@ -2670,28 +3443,45 @@ ibuf_insert_low(
if (buf_page_peek(space, page_no)
|| lock_rec_expl_exist_on_page(space, page_no)) {
- err = DB_STRONG_FAIL;
-
- mtr_commit(&bitmap_mtr);
- goto function_exit;
+ goto bitmap_fail;
}
- bits = ibuf_bitmap_page_get_bits(bitmap_page, page_no, zip_size,
- IBUF_BITMAP_FREE, &bitmap_mtr);
+ if (op == IBUF_OP_INSERT) {
+ ulint bits = ibuf_bitmap_page_get_bits(
+ bitmap_page, page_no, zip_size, IBUF_BITMAP_FREE,
+ &bitmap_mtr);
- if (buffered + entry_size + page_dir_calc_reserved_space(1)
- > ibuf_index_page_calc_free_from_bits(zip_size, bits)) {
- mtr_commit(&bitmap_mtr);
+ if (buffered + entry_size + page_dir_calc_reserved_space(1)
+ > ibuf_index_page_calc_free_from_bits(zip_size, bits)) {
+ /* Release the bitmap page latch early. */
+ mtr_commit(&bitmap_mtr);
+
+ /* It may not fit */
+ do_merge = TRUE;
+
+ ibuf_get_merge_page_nos(
+ FALSE, btr_pcur_get_rec(&pcur),
+ space_ids, space_versions,
+ page_nos, &n_stored);
- /* It may not fit */
+ err = DB_STRONG_FAIL;
+
+ goto function_exit;
+ }
+ }
+
+ /* Patch correct counter value to the entry to insert. This can
+ change the insert position, which can result in the need to abort in
+ some cases. */
+ if (!no_counter
+ && !ibuf_set_entry_counter(ibuf_entry, space, page_no, &pcur,
+ mode == BTR_MODIFY_PREV, &mtr)) {
+bitmap_fail:
err = DB_STRONG_FAIL;
- do_merge = TRUE;
+ mtr_commit(&bitmap_mtr);
- ibuf_get_merge_page_nos(FALSE, btr_pcur_get_rec(&pcur),
- space_ids, space_versions,
- page_nos, &n_stored);
goto function_exit;
}
@@ -2716,7 +3506,7 @@ ibuf_insert_low(
err = btr_cur_optimistic_insert(BTR_NO_LOCKING_FLAG, cursor,
ibuf_entry, &ins_rec,
&dummy_big_rec, 0, thr, &mtr);
- if (err == DB_SUCCESS) {
+ if (err == DB_SUCCESS && op != IBUF_OP_DELETE) {
/* Update the page max trx id field */
page_update_max_trx_id(btr_cur_get_block(cursor), NULL,
thr_get_trx(thr)->id, &mtr);
@@ -2736,7 +3526,7 @@ ibuf_insert_low(
cursor,
ibuf_entry, &ins_rec,
&dummy_big_rec, 0, thr, &mtr);
- if (err == DB_SUCCESS) {
+ if (err == DB_SUCCESS && op != IBUF_OP_DELETE) {
/* Update the page max trx id field */
page_update_max_trx_id(btr_cur_get_block(cursor), NULL,
thr_get_trx(thr)->id, &mtr);
@@ -2773,7 +3563,6 @@ function_exit:
mutex_enter(&ibuf_mutex);
ibuf->empty = FALSE;
- ibuf->n_inserts++;
mutex_exit(&ibuf_mutex);
@@ -2794,14 +3583,15 @@ function_exit:
}
/*********************************************************************//**
-Makes an index insert to the insert buffer, instead of directly to the disk
-page, if this is possible. Does not do insert if the index is clustered
-or unique.
+Buffer an operation in the insert/delete buffer, instead of doing it
+directly to the disk page, if this is possible. Does not do it if the index
+is clustered or unique.
@return TRUE if success */
UNIV_INTERN
ibool
ibuf_insert(
/*========*/
+ ibuf_op_t op, /*!< in: operation type */
const dtuple_t* entry, /*!< in: index entry to insert */
dict_index_t* index, /*!< in: index where to insert */
ulint space, /*!< in: space id where to insert */
@@ -2809,8 +3599,12 @@ ibuf_insert(
ulint page_no,/*!< in: page number where to insert */
que_thr_t* thr) /*!< in: query thread */
{
- ulint err;
- ulint entry_size;
+ ulint err;
+ ulint entry_size;
+ ibool no_counter;
+ /* Read the settable global variable ibuf_use only once in
+ this function, so that we will have a consistent view of it. */
+ ibuf_use_t use = ibuf_use;
ut_a(trx_sys_multiple_tablespace_format);
ut_ad(dtuple_check_typed(entry));
@@ -2818,30 +3612,108 @@ ibuf_insert(
ut_a(!dict_index_is_clust(index));
- switch (UNIV_EXPECT(ibuf_use, IBUF_USE_INSERT)) {
- case IBUF_USE_NONE:
- return(FALSE);
- case IBUF_USE_INSERT:
- goto do_insert;
- case IBUF_USE_COUNT:
+ no_counter = use <= IBUF_USE_INSERT;
+
+ switch (op) {
+ case IBUF_OP_INSERT:
+ switch (use) {
+ case IBUF_USE_NONE:
+ case IBUF_USE_DELETE:
+ case IBUF_USE_DELETE_MARK:
+ return(FALSE);
+ case IBUF_USE_INSERT:
+ case IBUF_USE_INSERT_DELETE_MARK:
+ case IBUF_USE_ALL:
+ goto check_watch;
+ case IBUF_USE_COUNT:
+ break;
+ }
+ break;
+ case IBUF_OP_DELETE_MARK:
+ switch (use) {
+ case IBUF_USE_NONE:
+ case IBUF_USE_INSERT:
+ return(FALSE);
+ case IBUF_USE_DELETE_MARK:
+ case IBUF_USE_DELETE:
+ case IBUF_USE_INSERT_DELETE_MARK:
+ case IBUF_USE_ALL:
+ ut_ad(!no_counter);
+ goto check_watch;
+ case IBUF_USE_COUNT:
+ break;
+ }
+ break;
+ case IBUF_OP_DELETE:
+ switch (use) {
+ case IBUF_USE_NONE:
+ case IBUF_USE_INSERT:
+ case IBUF_USE_INSERT_DELETE_MARK:
+ return(FALSE);
+ case IBUF_USE_DELETE_MARK:
+ case IBUF_USE_DELETE:
+ case IBUF_USE_ALL:
+ ut_ad(!no_counter);
+ goto skip_watch;
+ case IBUF_USE_COUNT:
+ break;
+ }
+ break;
+ case IBUF_OP_COUNT:
break;
}
- ut_error; /* unknown value of ibuf_use */
+ /* unknown op or use */
+ ut_error;
+
+check_watch:
+ /* If a thread attempts to buffer an insert on a page while a
+ purge is in progress on the same page, the purge must not be
+ buffered, because it could remove a record that was
+ re-inserted later. For simplicity, we block the buffering of
+ all operations on a page that has a purge pending.
+
+ We do not check this in the IBUF_OP_DELETE case, because that
+ would always trigger the buffer pool watch during purge and
+ thus prevent the buffering of delete operations. We assume
+ that the issuer of IBUF_OP_DELETE has called
+ buf_pool_watch_set(space, page_no). */
+
+ {
+ buf_page_t* bpage;
+ ulint fold = buf_page_address_fold(space, page_no);
+
+ buf_pool_mutex_enter();
+ bpage = buf_page_hash_get_low(space, page_no, fold);
+ buf_pool_mutex_exit();
+
+ if (UNIV_LIKELY_NULL(bpage)) {
+ /* A buffer pool watch has been set or the
+ page has been read into the buffer pool.
+ Do not buffer the request. If a purge operation
+ is being buffered, have this request executed
+ directly on the page in the buffer pool after the
+ buffered entries for this page have been merged. */
+ return(FALSE);
+ }
+ }
-do_insert:
+skip_watch:
entry_size = rec_get_converted_size(index, entry, 0);
if (entry_size
- >= (page_get_free_space_of_empty(dict_table_is_comp(index->table))
- / 2)) {
+ >= page_get_free_space_of_empty(dict_table_is_comp(index->table))
+ / 2) {
+
return(FALSE);
}
- err = ibuf_insert_low(BTR_MODIFY_PREV, entry, entry_size,
+ err = ibuf_insert_low(BTR_MODIFY_PREV, op, no_counter,
+ entry, entry_size,
index, space, zip_size, page_no, thr);
if (err == DB_FAIL) {
- err = ibuf_insert_low(BTR_MODIFY_TREE, entry, entry_size,
+ err = ibuf_insert_low(BTR_MODIFY_TREE, op, no_counter,
+ entry, entry_size,
index, space, zip_size, page_no, thr);
}
@@ -2893,6 +3765,20 @@ ibuf_insert_to_index_page(
rec = page_rec_get_next(page_get_infimum_rec(page));
+ if (page_rec_is_supremum(rec)) {
+ /* Empty pages can result from buffered delete operations.
+ The first record from the free list can be used to find the
+ father node. */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+ if (UNIV_UNLIKELY(rec == NULL)) {
+ fputs("InnoDB: Trying to insert a record from"
+ " the insert buffer to an index page\n"
+ "InnoDB: but the index page is empty!\n",
+ stderr);
+ goto dump;
+ }
+ }
+
if (UNIV_UNLIKELY(rec_get_n_fields(rec, index)
!= dtuple_get_n_fields(entry))) {
fputs("InnoDB: Trying to insert a record from"
@@ -2924,7 +3810,7 @@ dump:
rec = page_cur_get_rec(&page_cur);
page_zip = buf_block_get_page_zip(block);
- btr_cur_del_unmark_for_ibuf(rec, page_zip, mtr);
+ btr_cur_set_deleted_flag_for_ibuf(rec, page_zip, FALSE, mtr);
} else {
rec = page_cur_tuple_insert(&page_cur, entry, index, 0, mtr);
@@ -2986,6 +3872,178 @@ dump:
}
}
+/****************************************************************//**
+During merge, sets the delete mark on a record for a secondary index
+entry. */
+static
+void
+ibuf_set_del_mark(
+/*==============*/
+ const dtuple_t* entry, /*!< in: entry */
+ buf_block_t* block, /*!< in/out: block */
+ const dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t page_cur;
+ ulint low_match;
+
+ ut_ad(ibuf_inside());
+ ut_ad(dtuple_check_typed(entry));
+
+ low_match = page_cur_search(
+ block, index, entry, PAGE_CUR_LE, &page_cur);
+
+ if (low_match == dtuple_get_n_fields(entry)) {
+ rec_t* rec;
+ page_zip_des_t* page_zip;
+
+ rec = page_cur_get_rec(&page_cur);
+ page_zip = page_cur_get_page_zip(&page_cur);
+
+ btr_cur_set_deleted_flag_for_ibuf(rec, page_zip, TRUE, mtr);
+ } else {
+ /* This can happen benignly in some situations. */
+ }
+}
+
+/****************************************************************//**
+During merge, delete a record for a secondary index entry. */
+static
+void
+ibuf_delete(
+/*========*/
+ const dtuple_t* entry, /*!< in: entry */
+ buf_block_t* block, /*!< in/out: block */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in/out: mtr; must be committed
+ before latching any further pages */
+{
+ page_cur_t page_cur;
+ ulint low_match;
+
+ ut_ad(ibuf_inside());
+ ut_ad(dtuple_check_typed(entry));
+
+ low_match = page_cur_search(
+ block, index, entry, PAGE_CUR_LE, &page_cur);
+
+ if (low_match == dtuple_get_n_fields(entry)) {
+ page_zip_des_t* page_zip= buf_block_get_page_zip(block);
+ page_t* page = buf_block_get_frame(block);
+ rec_t* rec = page_cur_get_rec(&page_cur);
+
+ /* TODO: the below should probably be a separate function,
+ it's a bastardized version of btr_cur_optimistic_delete. */
+
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ mem_heap_t* heap = NULL;
+ ulint max_ins_size;
+
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(
+ rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ /* Refuse to delete the last record. */
+ ut_a(page_get_n_recs(page) > 1);
+
+ /* The record should have been marked for deletion. */
+ ut_ad(REC_INFO_DELETED_FLAG
+ & rec_get_info_bits(rec, page_is_comp(page)));
+
+ lock_update_delete(block, rec);
+
+ if (!page_zip) {
+ max_ins_size
+ = page_get_max_insert_size_after_reorganize(
+ page, 1);
+ }
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+ page_cur_delete_rec(&page_cur, index, offsets, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (page_zip) {
+ ibuf_update_free_bits_zip(block, mtr);
+ } else {
+ ibuf_update_free_bits_low(block, max_ins_size, mtr);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ } else {
+ /* This can happen benignly in some situations: either when
+ we crashed at just the right time, or on database startup
+ when we redo some old log entries (due to worse stored
+ position granularity on disk than in memory). */
+ }
+}
+
+/*********************************************************************//**
+Restores insert buffer tree cursor position
+@return TRUE if the position was restored; FALSE if not */
+static __attribute__((nonnull))
+ibool
+ibuf_restore_pos(
+/*=============*/
+ ulint space, /*!< in: space id */
+ ulint page_no,/*!< in: index page number where the record
+ should belong */
+ const dtuple_t* search_tuple,
+ /*!< in: search tuple for entries of page_no */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor whose
+ position is to be restored */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(mode == BTR_MODIFY_LEAF || mode == BTR_MODIFY_TREE);
+
+ if (btr_pcur_restore_position(mode, pcur, mtr)) {
+
+ return(TRUE);
+ }
+
+ if (fil_space_get_flags(space) == ULINT_UNDEFINED) {
+ /* The tablespace has been dropped. It is possible
+ that another thread has deleted the insert buffer
+ entry. Do not complain. */
+ btr_pcur_commit_specify_mtr(pcur, mtr);
+ } else {
+ fprintf(stderr,
+ "InnoDB: ERROR: Submit the output to"
+ " http://bugs.mysql.com\n"
+ "InnoDB: ibuf cursor restoration fails!\n"
+ "InnoDB: ibuf record inserted to page %lu:%lu\n",
+ (ulong) space, (ulong) page_no);
+ fflush(stderr);
+
+ rec_print_old(stderr, btr_pcur_get_rec(pcur));
+ rec_print_old(stderr, pcur->old_rec);
+ dtuple_print(stderr, search_tuple);
+
+ rec_print_old(stderr,
+ page_rec_get_next(btr_pcur_get_rec(pcur)));
+ fflush(stderr);
+
+ btr_pcur_commit_specify_mtr(pcur, mtr);
+
+ fputs("InnoDB: Validating insert buffer tree:\n", stderr);
+ if (!btr_validate_index(ibuf->index, NULL)) {
+ ut_error;
+ }
+
+ fprintf(stderr, "InnoDB: ibuf tree ok\n");
+ fflush(stderr);
+ }
+
+ return(FALSE);
+}
+
/*********************************************************************//**
Deletes from ibuf the record on which pcur is positioned. If we have to
resort to a pessimistic delete, this function commits mtr and closes
@@ -3040,41 +4098,8 @@ ibuf_delete_rec(
mtr_start(mtr);
- success = btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr);
-
- if (!success) {
- if (fil_space_get_flags(space) == ULINT_UNDEFINED) {
- /* The tablespace has been dropped. It is possible
- that another thread has deleted the insert buffer
- entry. Do not complain. */
- goto commit_and_exit;
- }
-
- fprintf(stderr,
- "InnoDB: ERROR: Submit the output to"
- " http://bugs.mysql.com\n"
- "InnoDB: ibuf cursor restoration fails!\n"
- "InnoDB: ibuf record inserted to page %lu\n",
- (ulong) page_no);
- fflush(stderr);
-
- rec_print_old(stderr, btr_pcur_get_rec(pcur));
- rec_print_old(stderr, pcur->old_rec);
- dtuple_print(stderr, search_tuple);
-
- rec_print_old(stderr,
- page_rec_get_next(btr_pcur_get_rec(pcur)));
- fflush(stderr);
-
- btr_pcur_commit_specify_mtr(pcur, mtr);
-
- fputs("InnoDB: Validating insert buffer tree:\n", stderr);
- if (!btr_validate_index(ibuf->index, NULL)) {
- ut_error;
- }
-
- fprintf(stderr, "InnoDB: ibuf tree ok\n");
- fflush(stderr);
+ if (!ibuf_restore_pos(space, page_no, search_tuple,
+ BTR_MODIFY_TREE, pcur, mtr)) {
goto func_exit;
}
@@ -3089,8 +4114,6 @@ ibuf_delete_rec(
ibuf_count_set(space, page_no, ibuf_count_get(space, page_no) - 1);
#endif
ibuf_size_update(root, mtr);
-
-commit_and_exit:
btr_pcur_commit_specify_mtr(pcur, mtr);
func_exit:
@@ -3103,11 +4126,11 @@ func_exit:
/*********************************************************************//**
When an index page is read from a disk to the buffer pool, this function
-inserts to the page the possible index entries buffered in the insert buffer.
-The entries are deleted from the insert buffer. If the page is not read, but
-created in the buffer pool, this function deletes its buffered entries from
-the insert buffer; there can exist entries for such a page if the page
-belonged to an index which subsequently was dropped. */
+applies any buffered operations to the page and deletes the entries from the
+insert buffer. If the page is not read, but created in the buffer pool, this
+function deletes its buffered entries from the insert buffer; there can
+exist entries for such a page if the page belonged to an index which
+subsequently was dropped. */
UNIV_INTERN
void
ibuf_merge_or_delete_for_page(
@@ -3128,15 +4151,18 @@ ibuf_merge_or_delete_for_page(
mem_heap_t* heap;
btr_pcur_t pcur;
dtuple_t* search_tuple;
- ulint n_inserts;
#ifdef UNIV_IBUF_DEBUG
- ulint volume;
+ ulint volume = 0;
#endif
page_zip_des_t* page_zip = NULL;
ibool tablespace_being_deleted = FALSE;
ibool corruption_noticed = FALSE;
mtr_t mtr;
+ /* Counts for merged & discarded operations. */
+ ulint mops[IBUF_OP_COUNT];
+ ulint dops[IBUF_OP_COUNT];
+
ut_ad(!block || buf_block_get_space(block) == space);
ut_ad(!block || buf_block_get_page_no(block) == page_no);
ut_ad(!block || buf_block_get_zip_size(block) == zip_size);
@@ -3279,10 +4305,9 @@ ibuf_merge_or_delete_for_page(
}
}
- n_inserts = 0;
-#ifdef UNIV_IBUF_DEBUG
- volume = 0;
-#endif
+ memset(mops, 0, sizeof(mops));
+ memset(dops, 0, sizeof(dops));
+
loop:
mtr_start(&mtr);
@@ -3335,33 +4360,94 @@ loop:
fputs("\nInnoDB: from the insert buffer!\n\n", stderr);
} else if (block) {
/* Now we have at pcur a record which should be
- inserted to the index page; NOTE that the call below
+ applied on the index page; NOTE that the call below
copies pointers to fields in rec, and we must
keep the latch to the rec page until the
insertion is finished! */
dtuple_t* entry;
trx_id_t max_trx_id;
dict_index_t* dummy_index;
+ ibuf_op_t op = ibuf_rec_get_op_type(rec);
max_trx_id = page_get_max_trx_id(page_align(rec));
page_update_max_trx_id(block, page_zip, max_trx_id,
&mtr);
+ ut_ad(page_validate(page_align(rec), ibuf->index));
+
entry = ibuf_build_entry_from_ibuf_rec(
rec, heap, &dummy_index);
+
+ ut_ad(page_validate(block->frame, dummy_index));
+
+ switch (op) {
+ ibool success;
+ case IBUF_OP_INSERT:
#ifdef UNIV_IBUF_DEBUG
- volume += rec_get_converted_size(dummy_index, entry, 0)
- + page_dir_calc_reserved_space(1);
- ut_a(volume <= 4 * UNIV_PAGE_SIZE
- / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ volume += rec_get_converted_size(
+ dummy_index, entry, 0);
+
+ volume += page_dir_calc_reserved_space(1);
+
+ ut_a(volume <= 4 * UNIV_PAGE_SIZE
+ / IBUF_PAGE_SIZE_PER_FREE_SPACE);
#endif
- ibuf_insert_to_index_page(entry, block,
- dummy_index, &mtr);
+ ibuf_insert_to_index_page(
+ entry, block, dummy_index, &mtr);
+ break;
+
+ case IBUF_OP_DELETE_MARK:
+ ibuf_set_del_mark(
+ entry, block, dummy_index, &mtr);
+ break;
+
+ case IBUF_OP_DELETE:
+ ibuf_delete(entry, block, dummy_index, &mtr);
+ /* Because ibuf_delete() will latch an
+ insert buffer bitmap page, commit mtr
+ before latching any further pages.
+ Store and restore the cursor position. */
+ ut_ad(rec == btr_pcur_get_rec(&pcur));
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(ibuf_rec_get_page_no(rec) == page_no);
+ ut_ad(ibuf_rec_get_space(rec) == space);
+
+ btr_pcur_store_position(&pcur, &mtr);
+ btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+ mtr_start(&mtr);
+
+ success = buf_page_get_known_nowait(
+ RW_X_LATCH, block,
+ BUF_KEEP_OLD,
+ __FILE__, __LINE__, &mtr);
+ ut_a(success);
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+
+ if (!ibuf_restore_pos(space, page_no,
+ search_tuple,
+ BTR_MODIFY_LEAF,
+ &pcur, &mtr)) {
+
+ mtr_commit(&mtr);
+ mops[op]++;
+ ibuf_dummy_index_free(dummy_index);
+ goto loop;
+ }
+
+ break;
+ default:
+ ut_error;
+ }
+
+ mops[op]++;
+
ibuf_dummy_index_free(dummy_index);
+ } else {
+ dops[ibuf_rec_get_op_type(rec)]++;
}
- n_inserts++;
-
/* Delete the record from ibuf */
if (ibuf_delete_rec(space, page_no, &pcur, search_tuple,
&mtr)) {
@@ -3378,12 +4464,6 @@ loop:
}
reset_bit:
-#ifdef UNIV_IBUF_COUNT_DEBUG
- if (ibuf_count_get(space, page_no) > 0) {
- /* btr_print_tree(ibuf_data->index->tree, 100);
- ibuf_print(); */
- }
-#endif
if (UNIV_LIKELY(update_ibuf_bitmap)) {
page_t* bitmap_page;
@@ -3418,7 +4498,8 @@ reset_bit:
mutex_enter(&ibuf_mutex);
ibuf->n_merges++;
- ibuf->n_merged_recs += n_inserts;
+ ibuf_add_ops(ibuf->n_merged_ops, mops);
+ ibuf_add_ops(ibuf->n_discarded_ops, dops);
mutex_exit(&ibuf_mutex);
@@ -3451,9 +4532,11 @@ ibuf_delete_for_discarded_space(
rec_t* ibuf_rec;
ulint page_no;
ibool closed;
- ulint n_inserts;
mtr_t mtr;
+ /* Counts for discarded operations. */
+ ulint dops[IBUF_OP_COUNT];
+
heap = mem_heap_create(512);
/* Use page number 0 to build the search tuple so that we get the
@@ -3461,7 +4544,7 @@ ibuf_delete_for_discarded_space(
search_tuple = ibuf_new_search_tuple_build(space, 0, heap);
- n_inserts = 0;
+ memset(dops, 0, sizeof(dops));
loop:
ibuf_enter();
@@ -3492,7 +4575,7 @@ loop:
page_no = ibuf_rec_get_page_no(ibuf_rec);
- n_inserts++;
+ dops[ibuf_rec_get_op_type(ibuf_rec)]++;
/* Delete the record from ibuf */
closed = ibuf_delete_rec(space, page_no, &pcur, search_tuple,
@@ -3522,10 +4605,7 @@ leave_loop:
/* Protect our statistics keeping from race conditions */
mutex_enter(&ibuf_mutex);
-
- ibuf->n_merges++;
- ibuf->n_merged_recs += n_inserts;
-
+ ibuf_add_ops(ibuf->n_discarded_ops, dops);
mutex_exit(&ibuf_mutex);
ibuf_exit();
@@ -3596,14 +4676,19 @@ ibuf_print(
mutex_enter(&ibuf_mutex);
fprintf(file,
- "Ibuf: size %lu, free list len %lu, seg size %lu,\n"
- "%lu inserts, %lu merged recs, %lu merges\n",
+ "Ibuf: size %lu, free list len %lu,"
+ " seg size %lu, %lu merges\n",
(ulong) ibuf->size,
(ulong) ibuf->free_list_len,
(ulong) ibuf->seg_size,
- (ulong) ibuf->n_inserts,
- (ulong) ibuf->n_merged_recs,
(ulong) ibuf->n_merges);
+
+ fputs("merged operations:\n ", file);
+ ibuf_print_ops(ibuf->n_merged_ops, file);
+
+ fputs("discarded operations:\n ", file);
+ ibuf_print_ops(ibuf->n_discarded_ops, file);
+
#ifdef UNIV_IBUF_COUNT_DEBUG
for (i = 0; i < IBUF_COUNT_N_SPACES; i++) {
for (j = 0; j < IBUF_COUNT_N_PAGES; j++) {
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
index d5c8258513c..cc08cc620c5 100644
--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -68,19 +68,30 @@ enum btr_latch_mode {
BTR_MODIFY_PREV = 36
};
+/* BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK are mutually exclusive. */
+
/** If this is ORed to btr_latch_mode, it means that the search tuple
-will be inserted to the index, at the searched position */
+will be inserted to the index, at the searched position.
+When the record is not in the buffer pool, try to use the insert buffer. */
#define BTR_INSERT 512
/** This flag ORed to btr_latch_mode says that we do the search in query
optimization */
#define BTR_ESTIMATE 1024
-/** This flag ORed to btr_latch_mode says that we can ignore possible
+/** This flag ORed to BTR_INSERT says that we can ignore possible
UNIQUE definition on secondary indexes when we decide if we can use
the insert buffer to speed up inserts */
#define BTR_IGNORE_SEC_UNIQUE 2048
+/** Try to delete mark the record at the searched position using the
+insert/delete buffer when the record is not in the buffer pool. */
+#define BTR_DELETE_MARK 4096
+
+/** Try to purge the record at the searched position using the insert/delete
+buffer when the record is not in the buffer pool. */
+#define BTR_DELETE 8192
+
/**************************************************************//**
Gets the root node of a tree and x-latches it.
@return root page, x-latched */
@@ -193,6 +204,10 @@ btr_leaf_page_release(
mtr_t* mtr); /*!< in: mtr */
/**************************************************************//**
Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
@return child node address */
UNIV_INLINE
ulint
@@ -317,12 +332,16 @@ Inserts a data tuple to a tree on a non-leaf level. It is assumed
that mtr holds an x-latch on the tree. */
UNIV_INTERN
void
-btr_insert_on_non_leaf_level(
-/*=========================*/
+btr_insert_on_non_leaf_level_func(
+/*==============================*/
dict_index_t* index, /*!< in: index */
ulint level, /*!< in: level, must be > 0 */
dtuple_t* tuple, /*!< in: the record to be inserted */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
+# define btr_insert_on_non_leaf_level(i,l,t,m) \
+ btr_insert_on_non_leaf_level_func(i,l,t,__FILE__,__LINE__,m)
#endif /* !UNIV_HOTBACKUP */
/****************************************************************//**
Sets a record as the predefined minimum record. */
diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic
index 2259d22c9a6..97944cc2e26 100644
--- a/storage/innobase/include/btr0btr.ic
+++ b/storage/innobase/include/btr0btr.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -255,6 +255,10 @@ btr_page_set_prev(
/**************************************************************//**
Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
@return child node address */
UNIV_INLINE
ulint
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
index 480a3877e54..136d2d068a1 100644
--- a/storage/innobase/include/btr0cur.h
+++ b/storage/innobase/include/btr0cur.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -138,7 +138,8 @@ btr_cur_search_to_nth_level(
should always be made using PAGE_CUR_LE to
search the position! */
ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
- BTR_INSERT and BTR_ESTIMATE;
+ at most one of BTR_INSERT, BTR_DELETE_MARK,
+ BTR_DELETE, or BTR_ESTIMATE;
cursor->left_block is used to store a pointer
to the left neighbor page, in the cases
BTR_SEARCH_PREV and BTR_MODIFY_PREV;
@@ -152,29 +153,39 @@ btr_cur_search_to_nth_level(
ulint has_search_latch,/*!< in: latch mode the caller
currently has on btr_search_latch:
RW_S_LATCH, or 0 */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
/*****************************************************************//**
Opens a cursor at either end of an index. */
UNIV_INTERN
void
-btr_cur_open_at_index_side(
-/*=======================*/
+btr_cur_open_at_index_side_func(
+/*============================*/
ibool from_left, /*!< in: TRUE if open to the low end,
FALSE if to the high end */
dict_index_t* index, /*!< in: index */
ulint latch_mode, /*!< in: latch mode */
btr_cur_t* cursor, /*!< in: cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
+#define btr_cur_open_at_index_side(f,i,l,c,m) \
+ btr_cur_open_at_index_side_func(f,i,l,c,__FILE__,__LINE__,m)
/**********************************************************************//**
Positions a cursor at a randomly chosen position within a B-tree. */
UNIV_INTERN
void
-btr_cur_open_at_rnd_pos(
-/*====================*/
+btr_cur_open_at_rnd_pos_func(
+/*=========================*/
dict_index_t* index, /*!< in: index */
ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_cur_t* cursor, /*!< in/out: B-tree cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
+#define btr_cur_open_at_rnd_pos(i,l,c,m) \
+ btr_cur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
/*************************************************************//**
Tries to perform an insert to a page in an index tree, next to cursor.
It is assumed that mtr holds an x-latch on the page. The operation does
@@ -322,19 +333,6 @@ btr_cur_del_mark_set_sec_rec(
ibool val, /*!< in: value to set */
que_thr_t* thr, /*!< in: query thread */
mtr_t* mtr); /*!< in: mtr */
-/***********************************************************//**
-Clear a secondary index record's delete mark. This function is only
-used by the insert buffer insert merge mechanism. */
-UNIV_INTERN
-void
-btr_cur_del_unmark_for_ibuf(
-/*========================*/
- rec_t* rec, /*!< in/out: record to delete unmark */
- page_zip_des_t* page_zip, /*!< in/out: compressed page
- corresponding to rec, or NULL
- when the tablespace is
- uncompressed */
- mtr_t* mtr); /*!< in: mtr */
/*************************************************************//**
Tries to compress a page of the tree if it seems useful. It is assumed
that mtr holds an x-latch on the tree and on the cursor page. To avoid
@@ -586,7 +584,20 @@ btr_push_update_extern_fields(
const upd_t* update, /*!< in: update vector */
mem_heap_t* heap) /*!< in: memory heap */
__attribute__((nonnull));
-
+/***********************************************************//**
+Sets a secondary index record's delete mark to the given value. This
+function is only used by the insert buffer merge mechanism. */
+UNIV_INTERN
+void
+btr_cur_set_deleted_flag_for_ibuf(
+/*==============================*/
+ rec_t* rec, /*!< in/out: record */
+ page_zip_des_t* page_zip, /*!< in/out: compressed page
+ corresponding to rec, or NULL
+ when the tablespace is
+ uncompressed */
+ ibool val, /*!< in: value to set */
+ mtr_t* mtr); /*!< in: mtr */
/*######################################################################*/
/** In the pessimistic delete, if the page data size drops below this
@@ -618,8 +629,13 @@ enum btr_cur_method {
hash_node, and might be necessary to
update */
BTR_CUR_BINARY, /*!< success using the binary search */
- BTR_CUR_INSERT_TO_IBUF /*!< performed the intended insert to
+ BTR_CUR_INSERT_TO_IBUF, /*!< performed the intended insert to
the insert buffer */
+ BTR_CUR_DEL_MARK_IBUF, /*!< performed the intended delete
+ mark in the insert/delete buffer */
+ BTR_CUR_DELETE_IBUF, /*!< performed the intended delete in
+ the insert/delete buffer */
+ BTR_CUR_DELETE_REF /*!< row_purge_poss_sec() failed */
};
/** The tree cursor: the definition appears here only for the compiler
@@ -627,6 +643,7 @@ to know struct size! */
struct btr_cur_struct {
dict_index_t* index; /*!< index where positioned */
page_cur_t page_cur; /*!< page cursor */
+ purge_node_t* purge_node; /*!< purge node, for BTR_DELETE */
buf_block_t* left_block; /*!< this field is used to store
a pointer to the left neighbor
page, in the cases
@@ -683,6 +700,23 @@ struct btr_cur_struct {
NULL */
ulint fold; /*!< fold value used in the search if
flag is BTR_CUR_HASH */
+ /*----- Delete buffering -------*/
+ ulint ibuf_cnt; /* in searches done on insert buffer
+ trees, this contains the "counter"
+ value (the first two bytes of the
+ fourth field) extracted from the
+ page above the leaf page, from the
+ father node pointer that pointed to
+ the leaf page. in other words, it
+ contains the minimum counter value
+ for records to be inserted on the
+ chosen leaf page. If for some reason
+ this can't be read, or if the search
+ ended on the leftmost leaf page in
+ the tree (in which case the father
+ node pointer had the 'minimum
+ record' flag set), this is
+ ULINT_UNDEFINED. */
/*------------------------------*/
/* @} */
btr_path_t* path_arr; /*!< in estimating the number of
diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h
index 12b1375d8b7..2334a266280 100644
--- a/storage/innobase/include/btr0pcur.h
+++ b/storage/innobase/include/btr0pcur.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -82,8 +82,8 @@ Initializes and opens a persistent cursor to an index tree. It should be
closed with btr_pcur_close. */
UNIV_INLINE
void
-btr_pcur_open(
-/*==========*/
+btr_pcur_open_func(
+/*===============*/
dict_index_t* index, /*!< in: index */
const dtuple_t* tuple, /*!< in: tuple on which search done */
ulint mode, /*!< in: PAGE_CUR_L, ...;
@@ -94,14 +94,18 @@ btr_pcur_open(
record! */
ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open(i,t,md,l,c,m) \
+ btr_pcur_open_func(i,t,md,l,c,__FILE__,__LINE__,m)
/**************************************************************//**
Opens an persistent cursor to an index tree without initializing the
cursor. */
UNIV_INLINE
void
-btr_pcur_open_with_no_init(
-/*=======================*/
+btr_pcur_open_with_no_init_func(
+/*============================*/
dict_index_t* index, /*!< in: index */
const dtuple_t* tuple, /*!< in: tuple on which search done */
ulint mode, /*!< in: PAGE_CUR_L, ...;
@@ -119,7 +123,12 @@ btr_pcur_open_with_no_init(
ulint has_search_latch,/*!< in: latch mode the caller
currently has on btr_search_latch:
RW_S_LATCH, or 0 */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open_with_no_init(ix,t,md,l,cur,has,m) \
+ btr_pcur_open_with_no_init_func(ix,t,md,l,cur,has,__FILE__,__LINE__,m)
+
/*****************************************************************//**
Opens a persistent cursor at either end of an index. */
UNIV_INLINE
@@ -160,8 +169,8 @@ before first in tree. The latching mode must be BTR_SEARCH_LEAF or
BTR_MODIFY_LEAF. */
UNIV_INTERN
void
-btr_pcur_open_on_user_rec(
-/*======================*/
+btr_pcur_open_on_user_rec_func(
+/*===========================*/
dict_index_t* index, /*!< in: index */
const dtuple_t* tuple, /*!< in: tuple on which search done */
ulint mode, /*!< in: PAGE_CUR_L, ... */
@@ -169,17 +178,25 @@ btr_pcur_open_on_user_rec(
BTR_MODIFY_LEAF */
btr_pcur_t* cursor, /*!< in: memory buffer for persistent
cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open_on_user_rec(i,t,md,l,c,m) \
+ btr_pcur_open_on_user_rec_func(i,t,md,l,c,__FILE__,__LINE__,m)
/**********************************************************************//**
Positions a cursor at a randomly chosen position within a B-tree. */
UNIV_INLINE
void
-btr_pcur_open_at_rnd_pos(
-/*=====================*/
+btr_pcur_open_at_rnd_pos_func(
+/*==========================*/
dict_index_t* index, /*!< in: index */
ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_pcur_t* cursor, /*!< in/out: B-tree pcur */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open_at_rnd_pos(i,l,c,m) \
+ btr_pcur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
/**************************************************************//**
Frees the possible old_rec_buf buffer of a persistent cursor and sets the
latch mode of the persistent cursor to BTR_NO_LATCHES. */
@@ -218,11 +235,15 @@ record and it can be restored on a user record whose ordering fields
are identical to the ones of the original user record */
UNIV_INTERN
ibool
-btr_pcur_restore_position(
-/*======================*/
+btr_pcur_restore_position_func(
+/*===========================*/
ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_pcur_t* cursor, /*!< in: detached persistent cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_restore_position(l,cur,mtr) \
+ btr_pcur_restore_position_func(l,cur,__FILE__,__LINE__,mtr)
/**************************************************************//**
If the latch mode of the cursor is BTR_LEAF_SEARCH or BTR_LEAF_MODIFY,
releases the page latch and bufferfix reserved by the cursor.
@@ -260,20 +281,13 @@ btr_pcur_get_mtr(
/*=============*/
btr_pcur_t* cursor); /*!< in: persistent cursor */
/**************************************************************//**
-Commits the pcur mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
that is, the cursor becomes detached. If there have been modifications
to the page where pcur is positioned, this can be used instead of
btr_pcur_release_leaf. Function btr_pcur_store_position should be used
before calling this, if restoration of cursor is wanted later. */
UNIV_INLINE
void
-btr_pcur_commit(
-/*============*/
- btr_pcur_t* pcur); /*!< in: persistent cursor */
-/**************************************************************//**
-Differs from btr_pcur_commit in that we can specify the mtr to commit. */
-UNIV_INLINE
-void
btr_pcur_commit_specify_mtr(
/*========================*/
btr_pcur_t* pcur, /*!< in: persistent cursor */
diff --git a/storage/innobase/include/btr0pcur.ic b/storage/innobase/include/btr0pcur.ic
index 0ca7223f861..0c38797e6c5 100644
--- a/storage/innobase/include/btr0pcur.ic
+++ b/storage/innobase/include/btr0pcur.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -395,30 +395,13 @@ btr_pcur_move_to_next(
}
/**************************************************************//**
-Commits the pcur mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
that is, the cursor becomes detached. If there have been modifications
to the page where pcur is positioned, this can be used instead of
btr_pcur_release_leaf. Function btr_pcur_store_position should be used
before calling this, if restoration of cursor is wanted later. */
UNIV_INLINE
void
-btr_pcur_commit(
-/*============*/
- btr_pcur_t* pcur) /*!< in: persistent cursor */
-{
- ut_a(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
-
- pcur->latch_mode = BTR_NO_LATCHES;
-
- mtr_commit(pcur->mtr);
-
- pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
-}
-
-/**************************************************************//**
-Differs from btr_pcur_commit in that we can specify the mtr to commit. */
-UNIV_INLINE
-void
btr_pcur_commit_specify_mtr(
/*========================*/
btr_pcur_t* pcur, /*!< in: persistent cursor */
@@ -483,8 +466,8 @@ Initializes and opens a persistent cursor to an index tree. It should be
closed with btr_pcur_close. */
UNIV_INLINE
void
-btr_pcur_open(
-/*==========*/
+btr_pcur_open_func(
+/*===============*/
dict_index_t* index, /*!< in: index */
const dtuple_t* tuple, /*!< in: tuple on which search done */
ulint mode, /*!< in: PAGE_CUR_L, ...;
@@ -495,6 +478,8 @@ btr_pcur_open(
record! */
ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
btr_cur_t* btr_cursor;
@@ -511,7 +496,7 @@ btr_pcur_open(
btr_cursor = btr_pcur_get_btr_cur(cursor);
btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
- btr_cursor, 0, mtr);
+ btr_cursor, 0, file, line, mtr);
cursor->pos_state = BTR_PCUR_IS_POSITIONED;
cursor->trx_if_known = NULL;
@@ -522,8 +507,8 @@ Opens an persistent cursor to an index tree without initializing the
cursor. */
UNIV_INLINE
void
-btr_pcur_open_with_no_init(
-/*=======================*/
+btr_pcur_open_with_no_init_func(
+/*============================*/
dict_index_t* index, /*!< in: index */
const dtuple_t* tuple, /*!< in: tuple on which search done */
ulint mode, /*!< in: PAGE_CUR_L, ...;
@@ -541,6 +526,8 @@ btr_pcur_open_with_no_init(
ulint has_search_latch,/*!< in: latch mode the caller
currently has on btr_search_latch:
RW_S_LATCH, or 0 */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
btr_cur_t* btr_cursor;
@@ -553,7 +540,8 @@ btr_pcur_open_with_no_init(
btr_cursor = btr_pcur_get_btr_cur(cursor);
btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
- btr_cursor, has_search_latch, mtr);
+ btr_cursor, has_search_latch,
+ file, line, mtr);
cursor->pos_state = BTR_PCUR_IS_POSITIONED;
cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
@@ -600,11 +588,13 @@ btr_pcur_open_at_index_side(
Positions a cursor at a randomly chosen position within a B-tree. */
UNIV_INLINE
void
-btr_pcur_open_at_rnd_pos(
-/*=====================*/
+btr_pcur_open_at_rnd_pos_func(
+/*==========================*/
dict_index_t* index, /*!< in: index */
ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_pcur_t* cursor, /*!< in/out: B-tree pcur */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
/* Initialize the cursor */
@@ -614,8 +604,9 @@ btr_pcur_open_at_rnd_pos(
btr_pcur_init(cursor);
- btr_cur_open_at_rnd_pos(index, latch_mode,
- btr_pcur_get_btr_cur(cursor), mtr);
+ btr_cur_open_at_rnd_pos_func(index, latch_mode,
+ btr_pcur_get_btr_cur(cursor),
+ file, line, mtr);
cursor->pos_state = BTR_PCUR_IS_POSITIONED;
cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
index 927ff893e39..62e4f54559a 100644
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -34,6 +34,7 @@ Created 11/5/1995 Heikki Tuuri
#include "ut0byte.h"
#include "page0types.h"
#ifndef UNIV_HOTBACKUP
+#include "ut0rbt.h"
#include "os0proc.h"
/** @name Modes for buf_page_get_gen */
@@ -46,6 +47,10 @@ Created 11/5/1995 Heikki Tuuri
it is error-prone programming
not to set a latch, and it
should be used with care */
+#define BUF_GET_IF_IN_POOL_OR_WATCH 15
+ /*!< Get the page only if it's in the
+ buffer pool, if not then set a watch
+ on the page. */
/* @} */
/** @name Modes for buf_page_get_known_nowait */
/* @{ */
@@ -81,6 +86,8 @@ The enumeration values must be 0..7. */
enum buf_page_state {
BUF_BLOCK_ZIP_FREE = 0, /*!< contains a free
compressed page */
+ BUF_BLOCK_POOL_WATCH = 0, /*!< a sentinel for the buffer pool
+ watch, element of buf_pool_watch[] */
BUF_BLOCK_ZIP_PAGE, /*!< contains a clean
compressed page */
BUF_BLOCK_ZIP_DIRTY, /*!< contains a compressed
@@ -202,20 +209,14 @@ with care. */
#define buf_page_get_with_no_latch(SP, ZS, OF, MTR) buf_page_get_gen(\
SP, ZS, OF, RW_NO_LATCH, NULL,\
BUF_GET_NO_LATCH, __FILE__, __LINE__, MTR)
-/**************************************************************//**
-NOTE! The following macros should be used instead of
-buf_page_optimistic_get_func, to improve debugging. Only values RW_S_LATCH and
-RW_X_LATCH are allowed as LA! */
-#define buf_page_optimistic_get(LA, BL, MC, MTR) \
- buf_page_optimistic_get_func(LA, BL, MC, __FILE__, __LINE__, MTR)
/********************************************************************//**
This is the general function used to get optimistic access to a database
page.
@return TRUE if success */
UNIV_INTERN
ibool
-buf_page_optimistic_get_func(
-/*=========================*/
+buf_page_optimistic_get(
+/*====================*/
ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
buf_block_t* block, /*!< in: guessed block */
ib_uint64_t modify_clock,/*!< in: modify clock value if mode is
@@ -291,7 +292,8 @@ buf_page_get_gen(
ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
buf_block_t* guess, /*!< in: guessed block or NULL */
ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
- BUF_GET_NO_LATCH */
+ BUF_GET_NO_LATCH or
+ BUF_GET_IF_IN_POOL_OR_WATCH */
const char* file, /*!< in: file name */
ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mini-transaction */
@@ -341,9 +343,8 @@ void
buf_page_release(
/*=============*/
buf_block_t* block, /*!< in: buffer block */
- ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH,
+ ulint rw_latch); /*!< in: RW_S_LATCH, RW_X_LATCH,
RW_NO_LATCH */
- mtr_t* mtr); /*!< in: mtr */
/********************************************************************//**
Moves a page to the start of the buffer pool LRU list. This high-level
function can be used to prevent an important page from slipping out of
@@ -995,6 +996,16 @@ Returns the control block of a file page, NULL if not found.
@return block, NULL if not found */
UNIV_INLINE
buf_page_t*
+buf_page_hash_get_low(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space */
+ ulint fold); /*!< in: buf_page_address_fold(space, offset) */
+/******************************************************************//**
+Returns the control block of a file page, NULL if not found.
+@return block, NULL if not found or not a real control block */
+UNIV_INLINE
+buf_page_t*
buf_page_hash_get(
/*==============*/
ulint space, /*!< in: space id */
@@ -1016,8 +1027,49 @@ UNIV_INTERN
ulint
buf_get_free_list_len(void);
/*=======================*/
-#endif /* !UNIV_HOTBACKUP */
+/********************************************************************
+Determine if a block is a sentinel for a buffer pool watch.
+@return TRUE if a sentinel for a buffer pool watch, FALSE if not */
+UNIV_INTERN
+ibool
+buf_pool_watch_is(
+/*==============*/
+ const buf_page_t* bpage) /*!< in: block */
+ __attribute__((nonnull, warn_unused_result));
+/****************************************************************//**
+Add watch for the given page to be read in. Caller must have the buffer pool
+@return NULL if watch set, block if the page is in the buffer pool */
+UNIV_INTERN
+buf_page_t*
+buf_pool_watch_set(
+/*===============*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: page number */
+ ulint fold) /*!< in: buf_page_address_fold(space, offset) */
+ __attribute__((warn_unused_result));
+/****************************************************************//**
+Stop watching if the page has been read in.
+buf_pool_watch_set(space,offset) must have returned NULL before. */
+UNIV_INTERN
+void
+buf_pool_watch_unset(
+/*=================*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: page number */
+/****************************************************************//**
+Check if the page has been read in.
+This may only be called after buf_pool_watch_set(space,offset)
+has returned NULL and before invoking buf_pool_watch_unset(space,offset).
+@return FALSE if the given page was not read in, TRUE if it was */
+UNIV_INTERN
+ibool
+buf_pool_watch_occurred(
+/*====================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+ __attribute__((warn_unused_result));
+#endif /* !UNIV_HOTBACKUP */
/** The common buffer control block structure
for compressed and uncompressed frames */
@@ -1057,7 +1109,10 @@ struct buf_page_struct{
#endif /* !UNIV_HOTBACKUP */
page_zip_des_t zip; /*!< compressed page; zip.data
(but not the data it points to) is
- also protected by buf_pool_mutex */
+ also protected by buf_pool_mutex;
+ state == BUF_BLOCK_ZIP_PAGE and
+ zip.data == NULL means an active
+ buf_pool_watch */
#ifndef UNIV_HOTBACKUP
buf_page_t* hash; /*!< node used in chaining to
buf_pool->page_hash or
@@ -1073,8 +1128,9 @@ struct buf_page_struct{
UT_LIST_NODE_T(buf_page_t) list;
/*!< based on state, this is a
- list node, protected only by
- buf_pool_mutex, in one of the
+ list node, protected either by
+ buf_pool_mutex or by
+ flush_list_mutex, in one of the
following lists in buf_pool:
- BUF_BLOCK_NOT_USED: free
@@ -1083,6 +1139,12 @@ struct buf_page_struct{
- BUF_BLOCK_ZIP_PAGE: zip_clean
- BUF_BLOCK_ZIP_FREE: zip_free[]
+ If bpage is part of flush_list
+ then the node pointers are
+ covered by flush_list_mutex.
+ Otherwise these pointers are
+ protected by buf_pool_mutex.
+
The contents of the list node
is undefined if !in_flush_list
&& state == BUF_BLOCK_FILE_PAGE,
@@ -1093,10 +1155,15 @@ struct buf_page_struct{
#ifdef UNIV_DEBUG
ibool in_flush_list; /*!< TRUE if in buf_pool->flush_list;
- when buf_pool_mutex is free, the
+ when flush_list_mutex is free, the
following should hold: in_flush_list
== (state == BUF_BLOCK_FILE_PAGE
- || state == BUF_BLOCK_ZIP_DIRTY) */
+ || state == BUF_BLOCK_ZIP_DIRTY)
+ Writes to this field must be
+ covered by both block->mutex
+ and flush_list_mutex. Hence
+ reads can happen while holding
+ any one of the two mutexes */
ibool in_free_list; /*!< TRUE if in buf_pool->free; when
buf_pool_mutex is free, the following
should hold: in_free_list
@@ -1106,7 +1173,8 @@ struct buf_page_struct{
/*!< log sequence number of
the youngest modification to
this block, zero if not
- modified */
+ modified. Protected by block
+ mutex */
ib_uint64_t oldest_modification;
/*!< log sequence number of
the START of the log entry
@@ -1114,7 +1182,12 @@ struct buf_page_struct{
modification to this block
which has not yet been flushed
on disk; zero if all
- modifications are on disk */
+ modifications are on disk.
+ Writes to this field must be
+ covered by both block->mutex
+ and flush_list_mutex. Hence
+ reads can happen while holding
+ any one of the two mutexes */
/* @} */
/** @name LRU replacement algorithm fields
These fields are protected by buf_pool_mutex only (not
@@ -1185,15 +1258,21 @@ struct buf_block_struct{
rw_lock_t lock; /*!< read-write lock of the buffer
frame */
unsigned lock_hash_val:32;/*!< hashed value of the page address
- in the record lock hash table */
- unsigned check_index_page_at_flush:1;
+ in the record lock hash table;
+ protected by buf_block_t::lock
+ (or buf_block_t::mutex, buf_pool_mutex
+ in buf_page_get_gen(),
+ buf_page_init_for_read()
+ and buf_page_create()) */
+ ibool check_index_page_at_flush;
/*!< TRUE if we know that this is
an index page, and want the database
to check its consistency before flush;
note that there may be pages in the
buffer pool which are index pages,
but this flag is not set because
- we do not keep track of all pages */
+ we do not keep track of all pages;
+ NOT protected by any mutex */
/* @} */
/** @name Optimistic search field */
/* @{ */
@@ -1346,6 +1425,21 @@ struct buf_pool_struct{
/* @{ */
+ mutex_t flush_list_mutex;/*!< mutex protecting the
+ flush list access. This mutex
+ protects flush_list, flush_rbt
+ and bpage::list pointers when
+ the bpage is on flush_list. It
+ also protects writes to
+ bpage::oldest_modification */
+ mutex_t flush_order_mutex;/*!< mutex to serialize access to
+ the flush list when we are putting
+ dirty blocks in the list. The idea
+ behind this mutex is to be able
+ to release log_sys->mutex during
+ mtr_commit and still ensure that
+ insertions in the flush_list happen
+ in the LSN order. */
UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
/*!< base node of the modified block
list */
@@ -1359,6 +1453,20 @@ struct buf_pool_struct{
/*!< this is in the set state
when there is no flush batch
of the given type running */
+ ib_rbt_t* flush_rbt; /*!< a red-black tree is used
+ exclusively during recovery to
+ speed up insertions in the
+ flush_list. This tree contains
+ blocks in order of
+ oldest_modification LSN and is
+ kept in sync with the
+ flush_list.
+ Each member of the tree MUST
+ also be on the flush_list.
+ This tree is relevant only in
+ recovery and is set to NULL
+ once the recovery is over.
+ Protected by flush_list_mutex */
ulint freed_page_clock;/*!< a sequence number used
to count the number of buffer
blocks removed from the end of
@@ -1372,8 +1480,8 @@ struct buf_pool_struct{
this is incremented by one; this is
set to zero when a buffer block is
allocated */
-
/* @} */
+
/** @name LRU replacement algorithm fields */
/* @{ */
@@ -1439,6 +1547,31 @@ Use these instead of accessing buf_pool_mutex directly. */
mutex_enter(&buf_pool_mutex); \
} while (0)
+/** Test if flush list mutex is owned. */
+#define buf_flush_list_mutex_own() mutex_own(&buf_pool->flush_list_mutex)
+
+/** Acquire the flush list mutex. */
+#define buf_flush_list_mutex_enter() do { \
+ mutex_enter(&buf_pool->flush_list_mutex); \
+} while (0)
+/** Release the flush list mutex. */
+# define buf_flush_list_mutex_exit() do { \
+ mutex_exit(&buf_pool->flush_list_mutex); \
+} while (0)
+
+/** Test if flush order mutex is owned. */
+#define buf_flush_order_mutex_own() mutex_own(&buf_pool->flush_order_mutex)
+
+/** Acquire the flush order mutex. */
+#define buf_flush_order_mutex_enter() do { \
+ mutex_enter(&buf_pool->flush_order_mutex); \
+} while (0)
+/** Release the flush order mutex. */
+# define buf_flush_order_mutex_exit() do { \
+ mutex_exit(&buf_pool->flush_order_mutex); \
+} while (0)
+
+
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/** Flag to forbid the release of the buffer pool mutex.
Protected by buf_pool_mutex. */
diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic
index 0f92a59a1c7..b9a9662fdc5 100644
--- a/storage/innobase/include/buf0buf.ic
+++ b/storage/innobase/include/buf0buf.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -81,7 +81,7 @@ buf_page_peek_if_too_old(
unsigned access_time = buf_page_is_accessed(bpage);
if (access_time > 0
- && (ut_time_ms() - access_time)
+ && ((ib_uint32_t) (ut_time_ms() - access_time))
>= buf_LRU_old_threshold_ms) {
return(TRUE);
}
@@ -121,7 +121,7 @@ buf_pool_get_oldest_modification(void)
buf_page_t* bpage;
ib_uint64_t lsn;
- buf_pool_mutex_enter();
+ buf_flush_list_mutex_enter();
bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
@@ -132,7 +132,7 @@ buf_pool_get_oldest_modification(void)
lsn = bpage->oldest_modification;
}
- buf_pool_mutex_exit();
+ buf_flush_list_mutex_exit();
/* The returned answer may be out of date: the flush_list can
change after the mutex has been released. */
@@ -705,6 +705,12 @@ buf_block_get_lock_hash_val(
/*========================*/
const buf_block_t* block) /*!< in: block */
{
+ ut_ad(block);
+ ut_ad(buf_page_in_file(&block->page));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_EXCLUSIVE)
+ || rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
return(block->lock_hash_val);
}
@@ -902,21 +908,20 @@ Returns the control block of a file page, NULL if not found.
@return block, NULL if not found */
UNIV_INLINE
buf_page_t*
-buf_page_hash_get(
-/*==============*/
+buf_page_hash_get_low(
+/*==================*/
ulint space, /*!< in: space id */
- ulint offset) /*!< in: offset of the page within space */
+ ulint offset, /*!< in: offset of the page within space */
+ ulint fold) /*!< in: buf_page_address_fold(space, offset) */
{
buf_page_t* bpage;
- ulint fold;
ut_ad(buf_pool);
ut_ad(buf_pool_mutex_own());
+ ut_ad(fold == buf_page_address_fold(space, offset));
/* Look for the page in the hash table */
- fold = buf_page_address_fold(space, offset);
-
HASH_SEARCH(hash, buf_pool->page_hash, fold, buf_page_t*, bpage,
ut_ad(bpage->in_page_hash && !bpage->in_zip_hash
&& buf_page_in_file(bpage)),
@@ -932,6 +937,26 @@ buf_page_hash_get(
}
/******************************************************************//**
+Returns the control block of a file page, NULL if not found.
+@return block, NULL if not found or not a real control block */
+UNIV_INLINE
+buf_page_t*
+buf_page_hash_get(
+/*==============*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: offset of the page within space */
+{
+ ulint fold = buf_page_address_fold(space, offset);
+ buf_page_t* bpage = buf_page_hash_get_low(space, offset, fold);
+
+ if (bpage && UNIV_UNLIKELY(buf_pool_watch_is(bpage))) {
+ bpage = NULL;
+ }
+
+ return(bpage);
+}
+
+/******************************************************************//**
Returns the control block of a file page, NULL if not found
or an uncompressed page frame does not exist.
@return block, NULL if not found */
@@ -942,7 +967,11 @@ buf_block_hash_get(
ulint space, /*!< in: space id */
ulint offset) /*!< in: offset of the page within space */
{
- return(buf_page_get_block(buf_page_hash_get(space, offset)));
+ buf_block_t* block;
+
+ block = buf_page_get_block(buf_page_hash_get(space, offset));
+
+ return(block);
}
/********************************************************************//**
@@ -1018,21 +1047,14 @@ void
buf_page_release(
/*=============*/
buf_block_t* block, /*!< in: buffer block */
- ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH,
+ ulint rw_latch) /*!< in: RW_S_LATCH, RW_X_LATCH,
RW_NO_LATCH */
- mtr_t* mtr) /*!< in: mtr */
{
ut_ad(block);
ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
ut_a(block->page.buf_fix_count > 0);
- if (rw_latch == RW_X_LATCH && mtr->modifications) {
- buf_pool_mutex_enter();
- buf_flush_note_modification(block, mtr);
- buf_pool_mutex_exit();
- }
-
mutex_enter(&block->mutex);
#ifdef UNIV_SYNC_DEBUG
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
index 6c751852f54..c76fcace46e 100644
--- a/storage/innobase/include/buf0flu.h
+++ b/storage/innobase/include/buf0flu.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -39,6 +39,16 @@ void
buf_flush_remove(
/*=============*/
buf_page_t* bpage); /*!< in: pointer to the block in question */
+/*******************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage has already been
+copied to dpage. */
+UNIV_INTERN
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+ buf_page_t* bpage, /*!< in/out: control block being moved */
+ buf_page_t* dpage); /*!< in/out: destination block */
/********************************************************************//**
Updates the flush system data structures when a write is completed. */
UNIV_INTERN
@@ -175,6 +185,22 @@ buf_flush_validate(void);
/*====================*/
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+/********************************************************************//**
+Initialize the red-black tree to speed up insertions into the flush_list
+during recovery process. Should be called at the start of recovery
+process before any page has been read/written. */
+UNIV_INTERN
+void
+buf_flush_init_flush_rbt(void);
+/*==========================*/
+
+/********************************************************************//**
+Frees up the red-black tree. */
+UNIV_INTERN
+void
+buf_flush_free_flush_rbt(void);
+/*==========================*/
+
/** When buf_flush_free_margin is called, it tries to make this many blocks
available to replacement in the free list and at the end of the LRU list (to
make sure that a read-ahead batch can be read efficiently in a single
diff --git a/storage/innobase/include/buf0flu.ic b/storage/innobase/include/buf0flu.ic
index c90cd59e4b6..fb71932e453 100644
--- a/storage/innobase/include/buf0flu.ic
+++ b/storage/innobase/include/buf0flu.ic
@@ -33,7 +33,8 @@ UNIV_INTERN
void
buf_flush_insert_into_flush_list(
/*=============================*/
- buf_block_t* block); /*!< in/out: block which is modified */
+ buf_block_t* block, /*!< in/out: block which is modified */
+ ib_uint64_t lsn); /*!< in: oldest modification */
/********************************************************************//**
Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not
@@ -42,7 +43,8 @@ UNIV_INTERN
void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
- buf_block_t* block); /*!< in/out: block which is modified */
+ buf_block_t* block, /*!< in/out: block which is modified */
+ ib_uint64_t lsn); /*!< in: oldest modification */
/********************************************************************//**
This function should be called at a mini-transaction commit, if a page was
@@ -61,24 +63,27 @@ buf_flush_note_modification(
#ifdef UNIV_SYNC_DEBUG
ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */
- ut_ad(buf_pool_mutex_own());
+
+ ut_ad(!buf_pool_mutex_own());
+ ut_ad(!buf_flush_list_mutex_own());
+ ut_ad(buf_flush_order_mutex_own());
ut_ad(mtr->start_lsn != 0);
ut_ad(mtr->modifications);
+
+ mutex_enter(&block->mutex);
ut_ad(block->page.newest_modification <= mtr->end_lsn);
block->page.newest_modification = mtr->end_lsn;
if (!block->page.oldest_modification) {
-
- block->page.oldest_modification = mtr->start_lsn;
- ut_ad(block->page.oldest_modification != 0);
-
- buf_flush_insert_into_flush_list(block);
+ buf_flush_insert_into_flush_list(block, mtr->start_lsn);
} else {
ut_ad(block->page.oldest_modification <= mtr->start_lsn);
}
+ mutex_exit(&block->mutex);
+
++srv_buf_pool_write_requests;
}
@@ -101,23 +106,23 @@ buf_flush_recv_note_modification(
ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */
- buf_pool_mutex_enter();
+ ut_ad(!buf_pool_mutex_own());
+ ut_ad(!buf_flush_list_mutex_own());
+ ut_ad(buf_flush_order_mutex_own());
+ ut_ad(start_lsn != 0);
ut_ad(block->page.newest_modification <= end_lsn);
+ mutex_enter(&block->mutex);
block->page.newest_modification = end_lsn;
if (!block->page.oldest_modification) {
-
- block->page.oldest_modification = start_lsn;
-
- ut_ad(block->page.oldest_modification != 0);
-
- buf_flush_insert_sorted_into_flush_list(block);
+ buf_flush_insert_sorted_into_flush_list(block, start_lsn);
} else {
ut_ad(block->page.oldest_modification <= start_lsn);
}
- buf_pool_mutex_exit();
+ mutex_exit(&block->mutex);
+
}
#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/data0type.ic b/storage/innobase/include/data0type.ic
index 240b4288f39..2bf67a941bd 100644
--- a/storage/innobase/include/data0type.ic
+++ b/storage/innobase/include/data0type.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -285,6 +285,10 @@ dtype_new_store_for_order_and_null_size(
#endif
ulint len;
+ ut_ad(type);
+ ut_ad(type->mtype >= DATA_VARCHAR);
+ ut_ad(type->mtype <= DATA_MYSQL);
+
buf[0] = (byte)(type->mtype & 0xFFUL);
if (type->prtype & DATA_BINARY_TYPE) {
diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h
index 747e9b5364e..d339eb73fb9 100644
--- a/storage/innobase/include/db0err.h
+++ b/storage/innobase/include/db0err.h
@@ -93,6 +93,13 @@ enum db_err {
DB_PRIMARY_KEY_IS_NULL, /* a column in the PRIMARY KEY
was found to be NULL */
+ DB_STATS_DO_NOT_EXIST, /* an operation that requires the
+ persistent storage, used for recording
+ table and index statistics, was
+ requested but this storage does not
+ exist itself or the stats for a given
+ table do not exist */
+
/* The following are partial failure codes */
DB_FAIL = 1000,
DB_OVERFLOW,
diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h
index 51d37ee98d1..e01fafe652d 100644
--- a/storage/innobase/include/dict0boot.h
+++ b/storage/innobase/include/dict0boot.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -137,6 +137,7 @@ clustered index */
#define DICT_SYS_INDEXES_PAGE_NO_FIELD 8
#define DICT_SYS_INDEXES_SPACE_NO_FIELD 7
#define DICT_SYS_INDEXES_TYPE_FIELD 6
+#define DICT_SYS_INDEXES_NAME_FIELD 3
/* When a row id which is zero modulo this number (which must be a power of
two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is
diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h
index 12396556c2d..788616d682a 100644
--- a/storage/innobase/include/dict0dict.h
+++ b/storage/innobase/include/dict0dict.h
@@ -928,9 +928,10 @@ UNIV_INTERN
void
dict_table_check_for_dup_indexes(
/*=============================*/
- const dict_table_t* table); /*!< in: Check for dup indexes
+ const dict_table_t* table, /*!< in: Check for dup indexes
in this table */
-
+ ibool tmp_ok);/*!< in: TRUE=allow temporary
+ index names */
#endif /* UNIV_DEBUG */
/**********************************************************************//**
Builds a node pointer out of a physical record and a page number.
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
index 2d001111938..e63fe920daa 100644
--- a/storage/innobase/include/dict0mem.h
+++ b/storage/innobase/include/dict0mem.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -80,21 +80,39 @@ combination of types */
/** File format */
/* @{ */
#define DICT_TF_FORMAT_SHIFT 5 /* file format */
-#define DICT_TF_FORMAT_MASK (127 << DICT_TF_FORMAT_SHIFT)
+#define DICT_TF_FORMAT_MASK \
+((~(~0 << (DICT_TF_BITS - DICT_TF_FORMAT_SHIFT))) << DICT_TF_FORMAT_SHIFT)
#define DICT_TF_FORMAT_51 0 /*!< InnoDB/MySQL up to 5.1 */
#define DICT_TF_FORMAT_ZIP 1 /*!< InnoDB plugin for 5.1:
compressed tables,
new BLOB treatment */
/** Maximum supported file format */
#define DICT_TF_FORMAT_MAX DICT_TF_FORMAT_ZIP
-
+/* @} */
#define DICT_TF_BITS 6 /*!< number of flag bits */
#if (1 << (DICT_TF_BITS - DICT_TF_FORMAT_SHIFT)) <= DICT_TF_FORMAT_MAX
# error "DICT_TF_BITS is insufficient for DICT_TF_FORMAT_MAX"
#endif
/* @} */
+
+/** @brief Additional table flags.
+
+These flags will be stored in SYS_TABLES.MIX_LEN. All unused flags
+will be written as 0. The column may contain garbage for tables
+created with old versions of InnoDB that only implemented
+ROW_FORMAT=REDUNDANT. */
+/* @{ */
+#define DICT_TF2_SHIFT DICT_TF_BITS
+ /*!< Shift value for
+ table->flags. */
+#define DICT_TF2_TEMPORARY 1 /*!< TRUE for tables from
+ CREATE TEMPORARY TABLE. */
+#define DICT_TF2_BITS (DICT_TF2_SHIFT + 1)
+ /*!< Total number of bits
+ in table->flags. */
/* @} */
+
/**********************************************************************//**
Creates a table memory object.
@return own: table object */
@@ -374,7 +392,7 @@ struct dict_table_struct{
unsigned space:32;
/*!< space where the clustered index of the
table is placed */
- unsigned flags:DICT_TF_BITS;/*!< DICT_TF_COMPACT, ... */
+ unsigned flags:DICT_TF2_BITS;/*!< DICT_TF_COMPACT, ... */
unsigned ibd_file_missing:1;
/*!< TRUE if this is in a single-table
tablespace and the .ibd file is missing; then
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index 74d0fbcdacd..36660d9845b 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -26,13 +26,13 @@ Created 10/25/1995 Heikki Tuuri
#ifndef fil0fil_h
#define fil0fil_h
-#include "univ.i"
-#ifndef UNIV_HOTBACKUP
-#include "sync0rw.h"
-#endif /* !UNIV_HOTBACKUP */
#include "dict0types.h"
#include "ut0byte.h"
#include "os0file.h"
+#ifndef UNIV_HOTBACKUP
+#include "sync0rw.h"
+#include "ibuf0types.h"
+#endif /* !UNIV_HOTBACKUP */
/** When mysqld is run, the default directory "." is the mysqld datadir,
but in the MySQL Embedded Server Library and ibbackup it is not the default
@@ -110,9 +110,10 @@ extern fil_addr_t fil_addr_null;
contents of this field is valid
for all uncompressed pages. */
#define FIL_PAGE_FILE_FLUSH_LSN 26 /*!< this is only defined for the
- first page in a data file: the file
- has been flushed to disk at least up
- to this lsn */
+ first page in a system tablespace
+ data file (ibdata*, not *.ibd):
+ the file has been flushed to disk
+ at least up to this lsn */
#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 /*!< starting from 4.1.x this
contains the space id of the page */
#define FIL_PAGE_DATA 38 /*!< start of the data on the page */
diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
index b737a00b3dc..9725ef05ad8 100644
--- a/storage/innobase/include/ha_prototypes.h
+++ b/storage/innobase/include/ha_prototypes.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
diff --git a/storage/innobase/include/handler0alter.h b/storage/innobase/include/handler0alter.h
index 7f5af6d2e76..017fe88d533 100644
--- a/storage/innobase/include/handler0alter.h
+++ b/storage/innobase/include/handler0alter.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h
index 977cb829f35..b17c21a45ef 100644
--- a/storage/innobase/include/hash0hash.h
+++ b/storage/innobase/include/hash0hash.h
@@ -434,11 +434,12 @@ struct hash_table_struct {
these heaps */
#endif /* !UNIV_HOTBACKUP */
mem_heap_t* heap;
+#ifdef UNIV_DEBUG
ulint magic_n;
+# define HASH_TABLE_MAGIC_N 76561114
+#endif /* UNIV_DEBUG */
};
-#define HASH_TABLE_MAGIC_N 76561114
-
#ifndef UNIV_NONINL
#include "hash0hash.ic"
#endif
diff --git a/storage/innobase/include/hash0hash.ic b/storage/innobase/include/hash0hash.ic
index 19da2d50701..0b437894e2e 100644
--- a/storage/innobase/include/hash0hash.ic
+++ b/storage/innobase/include/hash0hash.ic
@@ -35,6 +35,8 @@ hash_get_nth_cell(
hash_table_t* table, /*!< in: hash table */
ulint n) /*!< in: cell index */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
ut_ad(n < table->n_cells);
return(table->array + n);
@@ -48,6 +50,8 @@ hash_table_clear(
/*=============*/
hash_table_t* table) /*!< in/out: hash table */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
memset(table->array, 0x0,
table->n_cells * sizeof(*table->array));
}
@@ -61,6 +65,8 @@ hash_get_n_cells(
/*=============*/
hash_table_t* table) /*!< in: table */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
return(table->n_cells);
}
@@ -74,6 +80,8 @@ hash_calc_hash(
ulint fold, /*!< in: folded value */
hash_table_t* table) /*!< in: hash table */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
return(ut_hash_ulint(fold, table->n_cells));
}
@@ -88,6 +96,8 @@ hash_get_mutex_no(
hash_table_t* table, /*!< in: hash table */
ulint fold) /*!< in: fold */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
ut_ad(ut_is_2pow(table->n_mutexes));
return(ut_2pow_remainder(hash_calc_hash(fold, table),
table->n_mutexes));
@@ -103,6 +113,8 @@ hash_get_nth_heap(
hash_table_t* table, /*!< in: hash table */
ulint i) /*!< in: index of the heap */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
ut_ad(i < table->n_mutexes);
return(table->heaps[i]);
@@ -120,6 +132,9 @@ hash_get_heap(
{
ulint i;
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+
if (table->heap) {
return(table->heap);
}
@@ -139,6 +154,8 @@ hash_get_nth_mutex(
hash_table_t* table, /*!< in: hash table */
ulint i) /*!< in: index of the mutex */
{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
ut_ad(i < table->n_mutexes);
return(table->mutexes + i);
@@ -156,6 +173,9 @@ hash_get_mutex(
{
ulint i;
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+
i = hash_get_mutex_no(table, fold);
return(hash_get_nth_mutex(table, i));
diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h
index 8aa21fb9d95..0f1631fde77 100644
--- a/storage/innobase/include/ibuf0ibuf.h
+++ b/storage/innobase/include/ibuf0ibuf.h
@@ -35,12 +35,27 @@ Created 7/19/1997 Heikki Tuuri
#ifndef UNIV_HOTBACKUP
# include "ibuf0types.h"
+/* Possible operations buffered in the insert/whatever buffer. See
+ibuf_insert(). DO NOT CHANGE THE VALUES OF THESE, THEY ARE STORED ON DISK. */
+typedef enum {
+ IBUF_OP_INSERT = 0,
+ IBUF_OP_DELETE_MARK = 1,
+ IBUF_OP_DELETE = 2,
+
+ /* Number of different operation types. */
+ IBUF_OP_COUNT = 3,
+} ibuf_op_t;
+
/** Combinations of operations that can be buffered. Because the enum
values are used for indexing innobase_change_buffering_values[], they
should start at 0 and there should not be any gaps. */
typedef enum {
IBUF_USE_NONE = 0,
IBUF_USE_INSERT, /* insert */
+ IBUF_USE_DELETE_MARK, /* delete */
+ IBUF_USE_INSERT_DELETE_MARK, /* insert+delete */
+ IBUF_USE_DELETE, /* delete+purge */
+ IBUF_USE_ALL, /* insert+delete+purge */
IBUF_USE_COUNT /* number of entries in ibuf_use_t */
} ibuf_use_t;
@@ -72,8 +87,7 @@ separately committed mini-transaction, because in crash recovery, the
free bits could momentarily be set too high. */
/******************************************************************//**
-Creates the insert buffer data structure at a database startup and
-initializes the data structures for the insert buffer of each tablespace. */
+Creates the insert buffer data structure at a database startup. */
UNIV_INTERN
void
ibuf_init_at_db_start(void);
@@ -243,14 +257,15 @@ void
ibuf_free_excess_pages(void);
/*========================*/
/*********************************************************************//**
-Makes an index insert to the insert buffer, instead of directly to the disk
-page, if this is possible. Does not do insert if the index is clustered
-or unique.
+Buffer an operation in the insert/delete buffer, instead of doing it
+directly to the disk page, if this is possible. Does not do it if the index
+is clustered or unique.
@return TRUE if success */
UNIV_INTERN
ibool
ibuf_insert(
/*========*/
+ ibuf_op_t op, /*!< in: operation type */
const dtuple_t* entry, /*!< in: index entry to insert */
dict_index_t* index, /*!< in: index where to insert */
ulint space, /*!< in: space id where to insert */
@@ -259,11 +274,11 @@ ibuf_insert(
que_thr_t* thr); /*!< in: query thread */
/*********************************************************************//**
When an index page is read from a disk to the buffer pool, this function
-inserts to the page the possible index entries buffered in the insert buffer.
-The entries are deleted from the insert buffer. If the page is not read, but
-created in the buffer pool, this function deletes its buffered entries from
-the insert buffer; there can exist entries for such a page if the page
-belonged to an index which subsequently was dropped. */
+applies any buffered operations to the page and deletes the entries from the
+insert buffer. If the page is not read, but created in the buffer pool, this
+function deletes its buffered entries from the insert buffer; there can
+exist entries for such a page if the page belonged to an index which
+subsequently was dropped. */
UNIV_INTERN
void
ibuf_merge_or_delete_for_page(
@@ -356,6 +371,15 @@ void
ibuf_print(
/*=======*/
FILE* file); /*!< in: file where to print */
+/********************************************************************
+Read the first two bytes from a record's fourth field (counter field in new
+records; something else in older records).
+@return "counter" field, or ULINT_UNDEFINED if for some reason it can't be read */
+UNIV_INTERN
+ulint
+ibuf_rec_get_counter(
+/*=================*/
+ const rec_t* rec); /*!< in: ibuf record */
/******************************************************************//**
Closes insert buffer and frees the data structures. */
UNIV_INTERN
diff --git a/storage/innobase/include/ibuf0ibuf.ic b/storage/innobase/include/ibuf0ibuf.ic
index 15bbe61ab30..84c7a004be2 100644
--- a/storage/innobase/include/ibuf0ibuf.ic
+++ b/storage/innobase/include/ibuf0ibuf.ic
@@ -55,10 +55,15 @@ struct ibuf_struct{
ulint height; /*!< tree height */
dict_index_t* index; /*!< insert buffer index */
- ulint n_inserts; /*!< number of inserts made to
- the insert buffer */
ulint n_merges; /*!< number of pages merged */
- ulint n_merged_recs; /*!< number of records merged */
+ ulint n_merged_ops[IBUF_OP_COUNT];
+ /*!< number of operations of each type
+ merged to index pages */
+ ulint n_discarded_ops[IBUF_OP_COUNT];
+ /*!< number of operations of each type
+ discarded without merging due to the
+ tablespace being deleted or the
+ index being dropped */
};
/************************************************************************//**
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
index 82e4c9bd976..ad271a95654 100644
--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -613,13 +613,16 @@ lock_rec_print(
FILE* file, /*!< in: file where to print */
const lock_t* lock); /*!< in: record type lock */
/*********************************************************************//**
-Prints info of locks for all transactions. */
+Prints info of locks for all transactions.
+@return FALSE if not able to obtain kernel mutex
+and exits without printing info */
UNIV_INTERN
-void
+ibool
lock_print_info_summary(
/*====================*/
- FILE* file); /*!< in: file where to print */
-/*********************************************************************//**
+ FILE* file, /*!< in: file where to print */
+ ibool nowait);/*!< in: whether to wait for the kernel mutex */
+/*************************************************************************
Prints info of locks for each transaction. */
UNIV_INTERN
void
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
index 135aeb69e2d..8fce4ef96bc 100644
--- a/storage/innobase/include/log0log.h
+++ b/storage/innobase/include/log0log.h
@@ -1,23 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
-
-*****************************************************************************/
-/*****************************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2009, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -825,7 +808,17 @@ struct log_struct{
written to some log group; for this to
be advanced, it is enough that the
write i/o has been completed for all
- log groups */
+ log groups.
+ Note that since InnoDB currently
+ has only one log group therefore
+ this value is redundant. Also it
+ is possible that this value
+ falls behind the
+ flushed_to_disk_lsn transiently.
+ It is appropriate to use either
+ flushed_to_disk_lsn or
+ write_lsn which are always
+ up-to-date and accurate. */
ib_uint64_t write_lsn; /*!< end lsn for the current running
write */
ulint write_end_offset;/*!< the data in buffer has
diff --git a/storage/innobase/include/log0log.ic b/storage/innobase/include/log0log.ic
index 36d151a3064..139f4041a36 100644
--- a/storage/innobase/include/log0log.ic
+++ b/storage/innobase/include/log0log.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -314,12 +314,15 @@ log_reserve_and_write_fast(
ulint data_len;
#ifdef UNIV_LOG_LSN_DEBUG
/* length of the LSN pseudo-record */
- ulint lsn_len = 1
- + mach_get_compressed_size(log_sys->lsn >> 32)
- + mach_get_compressed_size(log_sys->lsn & 0xFFFFFFFFUL);
+ ulint lsn_len;
#endif /* UNIV_LOG_LSN_DEBUG */
mutex_enter(&log_sys->mutex);
+#ifdef UNIV_LOG_LSN_DEBUG
+ lsn_len = 1
+ + mach_get_compressed_size(log_sys->lsn >> 32)
+ + mach_get_compressed_size(log_sys->lsn & 0xFFFFFFFFUL);
+#endif /* UNIV_LOG_LSN_DEBUG */
data_len = len
#ifdef UNIV_LOG_LSN_DEBUG
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h
index a3d2bd050f5..3209799e140 100644
--- a/storage/innobase/include/log0recv.h
+++ b/storage/innobase/include/log0recv.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -176,6 +176,12 @@ UNIV_INTERN
void
recv_recovery_from_checkpoint_finish(void);
/*======================================*/
+/********************************************************//**
+Initiates the rollback of active transactions. */
+UNIV_INTERN
+void
+recv_recovery_rollback_active(void);
+/*===============================*/
/*******************************************************//**
Scans log from a buffer and stores new log data to the parsing buffer.
Parses and hashes the log records if new data found. Unless
@@ -258,12 +264,14 @@ void
recv_sys_init(
/*==========*/
ulint available_memory); /*!< in: available memory in bytes */
+#ifndef UNIV_HOTBACKUP
/********************************************************//**
Reset the state of the recovery system variables. */
UNIV_INTERN
void
recv_sys_var_init(void);
/*===================*/
+#endif /* !UNIV_HOTBACKUP */
/*******************************************************************//**
Empties the hash table of stored log records, applying them to appropriate
pages. */
diff --git a/storage/innobase/include/mem0dbg.h b/storage/innobase/include/mem0dbg.h
index a064af5c678..d81e1418b2b 100644
--- a/storage/innobase/include/mem0dbg.h
+++ b/storage/innobase/include/mem0dbg.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -28,6 +28,13 @@ Created 6/9/1994 Heikki Tuuri
check fields whose sizes are given below */
#ifdef UNIV_MEM_DEBUG
+# ifndef UNIV_HOTBACKUP
+/* The mutex which protects in the debug version the hash table
+containing the list of live memory heaps, and also the global
+variables in mem0dbg.c. */
+extern mutex_t mem_hash_mutex;
+# endif /* !UNIV_HOTBACKUP */
+
#define MEM_FIELD_HEADER_SIZE ut_calc_align(2 * sizeof(ulint),\
UNIV_MEM_ALIGNMENT)
#define MEM_FIELD_TRAILER_SIZE sizeof(ulint)
diff --git a/storage/innobase/include/mem0dbg.ic b/storage/innobase/include/mem0dbg.ic
index cb9245411dc..b0c8178a623 100644
--- a/storage/innobase/include/mem0dbg.ic
+++ b/storage/innobase/include/mem0dbg.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -25,9 +25,6 @@ Created 6/8/1994 Heikki Tuuri
*************************************************************************/
#ifdef UNIV_MEM_DEBUG
-# ifndef UNIV_HOTBACKUP
-extern mutex_t mem_hash_mutex;
-# endif /* !UNIV_HOTBACKUP */
extern ulint mem_current_allocated_memory;
/******************************************************************//**
diff --git a/storage/innobase/include/mem0mem.h b/storage/innobase/include/mem0mem.h
index 98f8748e529..5181bb4c9f7 100644
--- a/storage/innobase/include/mem0mem.h
+++ b/storage/innobase/include/mem0mem.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -359,6 +359,9 @@ struct mem_block_info_struct {
to the heap is also the first block in this list,
though it also contains the base node of the list. */
ulint len; /*!< physical length of this block in bytes */
+ ulint total_size; /*!< physical length in bytes of all blocks
+ in the heap. This is defined only in the base
+ node and is set to ULINT_UNDEFINED in others. */
ulint type; /*!< type of heap: MEM_HEAP_DYNAMIC, or
MEM_HEAP_BUF possibly ORed to MEM_HEAP_BTR_SEARCH */
ulint free; /*!< offset in bytes of the first free position for
diff --git a/storage/innobase/include/mem0mem.ic b/storage/innobase/include/mem0mem.ic
index e7080d8c508..cbce2edc661 100644
--- a/storage/innobase/include/mem0mem.ic
+++ b/storage/innobase/include/mem0mem.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -579,18 +579,12 @@ mem_heap_get_size(
/*==============*/
mem_heap_t* heap) /*!< in: heap */
{
- mem_block_t* block;
ulint size = 0;
ut_ad(mem_heap_check(heap));
- block = heap;
-
- while (block != NULL) {
+ size = heap->total_size;
- size += mem_block_get_len(block);
- block = UT_LIST_GET_NEXT(list, block);
- }
#ifndef UNIV_HOTBACKUP
if (heap->free_block) {
size += UNIV_PAGE_SIZE;
diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic
index 310c7c4117f..18f8e87b3cf 100644
--- a/storage/innobase/include/mtr0mtr.ic
+++ b/storage/innobase/include/mtr0mtr.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -70,6 +70,7 @@ mtr_memo_push(
ut_ad(type <= MTR_MEMO_X_LOCK);
ut_ad(mtr);
ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
memo = &(mtr->memo);
@@ -92,6 +93,7 @@ mtr_set_savepoint(
ut_ad(mtr);
ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
memo = &(mtr->memo);
@@ -149,6 +151,7 @@ mtr_memo_contains(
ut_ad(mtr);
ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE || mtr->state == MTR_COMMITTING);
memo = &(mtr->memo);
diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
index 16568579f31..a112cb06697 100644
--- a/storage/innobase/include/os0file.h
+++ b/storage/innobase/include/os0file.h
@@ -1,23 +1,6 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
-
-*****************************************************************************/
/***********************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2009, Percona Inc.
Portions of this file contain modifications contributed and copyrighted
@@ -93,29 +76,23 @@ extern ulint os_n_pending_writes;
#ifdef __WIN__
/** File handle */
-#define os_file_t HANDLE
+# define os_file_t HANDLE
/** Convert a C file descriptor to a native file handle
@param fd file descriptor
@return native file handle */
-#define OS_FILE_FROM_FD(fd) (HANDLE) _get_osfhandle(fd)
+# define OS_FILE_FROM_FD(fd) (HANDLE) _get_osfhandle(fd)
#else
/** File handle */
typedef int os_file_t;
/** Convert a C file descriptor to a native file handle
@param fd file descriptor
@return native file handle */
-#define OS_FILE_FROM_FD(fd) fd
+# define OS_FILE_FROM_FD(fd) fd
#endif
/** Umask for creating files */
extern ulint os_innodb_umask;
-/** If this flag is TRUE, then we will use the native aio of the
-OS (provided we compiled Innobase with it in), otherwise we will
-use simulated aio we build below with threads */
-
-extern ibool os_aio_use_native_aio;
-
/** The next value should be smaller or equal to the smallest sector size used
on any disk. A log block is required to be a portion of disk which is written
so that if the start and the end of a block get written to disk, then the
@@ -158,7 +135,8 @@ log. */
#define OS_FILE_SHARING_VIOLATION 76
#define OS_FILE_ERROR_NOT_SPECIFIED 77
#define OS_FILE_INSUFFICIENT_RESOURCE 78
-#define OS_FILE_OPERATION_ABORTED 79
+#define OS_FILE_AIO_INTERRUPTED 79
+#define OS_FILE_OPERATION_ABORTED 80
/* @} */
/** Types for aio operations @{ */
@@ -204,6 +182,157 @@ extern ulint os_n_file_reads;
extern ulint os_n_file_writes;
extern ulint os_n_fsyncs;
+#ifdef UNIV_PFS_IO
+/* Keys to register InnoDB I/O with performance schema */
+extern mysql_pfs_key_t innodb_file_data_key;
+extern mysql_pfs_key_t innodb_file_log_key;
+extern mysql_pfs_key_t innodb_file_temp_key;
+
+/* Following four macros are instumentations to register
+various file I/O operations with performance schema.
+1) register_pfs_file_open_begin() and register_pfs_file_open_end() are
+used to register file creation, opening, closing and renaming.
+2) register_pfs_file_io_begin() and register_pfs_file_io_end() are
+used to register actual file read, write and flush */
+# define register_pfs_file_open_begin(locker, key, op, name, \
+ src_file, src_line) \
+do { \
+ if (PSI_server) { \
+ locker = PSI_server->get_thread_file_name_locker( \
+ key, op, name, &locker); \
+ if (locker) { \
+ PSI_server->start_file_open_wait( \
+ locker, src_file, src_line); \
+ } \
+ } \
+} while (0)
+
+# define register_pfs_file_open_end(locker, file) \
+do { \
+ if (locker) { \
+ PSI_server->end_file_open_wait_and_bind_to_descriptor( \
+ locker, file); \
+ } \
+} while (0)
+
+# define register_pfs_file_io_begin(locker, file, count, op, \
+ src_file, src_line) \
+do { \
+ if (PSI_server) { \
+ locker = PSI_server->get_thread_file_descriptor_locker( \
+ file, op); \
+ if (locker) { \
+ PSI_server->start_file_wait( \
+ locker, count, src_file, src_line); \
+ } \
+ } \
+} while (0)
+
+# define register_pfs_file_io_end(locker, count) \
+do { \
+ if (locker) { \
+ PSI_server->end_file_wait(locker, count); \
+ } \
+} while (0)
+#endif /* UNIV_PFS_IO */
+
+/* Following macros/functions are file I/O APIs that would be performance
+schema instrumented if "UNIV_PFS_IO" is defined. They would point to
+wrapper functions with performance schema instrumentation in such case.
+
+os_file_create
+os_file_create_simple
+os_file_create_simple_no_error_handling
+os_file_close
+os_file_rename
+os_aio
+os_file_read
+os_file_read_no_error_handling
+os_file_write
+
+The wrapper functions have the prefix of "innodb_". */
+
+#ifdef UNIV_PFS_IO
+# define os_file_create(key, name, create, purpose, type, success) \
+ pfs_os_file_create_func(key, name, create, purpose, type, \
+ success, __FILE__, __LINE__)
+
+# define os_file_create_simple(key, name, create, access, success) \
+ pfs_os_file_create_simple_func(key, name, create, access, \
+ success, __FILE__, __LINE__)
+
+# define os_file_create_simple_no_error_handling( \
+ key, name, create_mode, access, success) \
+ pfs_os_file_create_simple_no_error_handling_func( \
+ key, name, create_mode, access, success, __FILE__, __LINE__)
+
+# define os_file_close(file) \
+ pfs_os_file_close_func(file, __FILE__, __LINE__)
+
+# define os_aio(type, mode, name, file, buf, offset, offset_high, \
+ n, message1, message2) \
+ pfs_os_aio_func(type, mode, name, file, buf, offset, \
+ offset_high, n, message1, message2, \
+ __FILE__, __LINE__)
+
+# define os_file_read(file, buf, offset, offset_high, n) \
+ pfs_os_file_read_func(file, buf, offset, offset_high, n, \
+ __FILE__, __LINE__)
+
+# define os_file_read_no_error_handling(file, buf, offset, \
+ offset_high, n) \
+ pfs_os_file_read_no_error_handling_func(file, buf, offset, \
+ offset_high, n, \
+ __FILE__, __LINE__)
+
+# define os_file_write(name, file, buf, offset, offset_high, n) \
+ pfs_os_file_write_func(name, file, buf, offset, offset_high, \
+ n, __FILE__, __LINE__)
+
+# define os_file_flush(file) \
+ pfs_os_file_flush_func(file, __FILE__, __LINE__)
+
+# define os_file_rename(key, oldpath, newpath) \
+ pfs_os_file_rename_func(key, oldpath, newpath, __FILE__, __LINE__)
+#else /* UNIV_PFS_IO */
+
+/* If UNIV_PFS_IO is not defined, these I/O APIs point
+to original un-instrumented file I/O APIs */
+# define os_file_create(key, name, create, purpose, type, success) \
+ os_file_create_func(name, create, purpose, type, success)
+
+# define os_file_create_simple(key, name, create, access, success) \
+ os_file_create_simple_func(name, create_mode, access, success)
+
+# define os_file_create_simple_no_error_handling( \
+ key, name, create_mode, access, success) \
+ os_file_create_simple_no_error_handling_func( \
+ name, create_mode, access, success)
+
+# define os_file_close(file) os_file_close_func(file)
+
+# define os_aio(type, mode, name, file, buf, offset, offset_high, \
+ n, message1, message2) \
+ os_aio_func(type, mode, name, file, buf, offset, offset_high, n,\
+ message1, message2)
+
+# define os_file_read(file, buf, offset, offset_high, n) \
+ os_file_read_func(file, buf, offset, offset_high, n)
+
+# define os_file_read_no_error_handling(file, buf, offset, \
+ offset_high, n) \
+ os_file_read_no_error_handling_func(file, buf, offset, offset_high, n)
+
+# define os_file_write(name, file, buf, offset, offset_high, n) \
+ os_file_write_func(name, file, buf, offset, offset_high, n)
+
+# define os_file_flush(file) os_file_flush_func(file)
+
+# define os_file_rename(key, oldpath, newpath) \
+ os_file_rename_func(oldpath, newpath)
+
+#endif /* UNIV_PFS_IO */
+
/* File types for directory entry data type */
enum os_file_type_enum{
@@ -313,13 +442,15 @@ os_file_create_directory(
ibool fail_if_exists);/*!< in: if TRUE, pre-existing directory
is treated as an error. */
/****************************************************************//**
+NOTE! Use the corresponding macro os_file_create_simple(), not directly
+this function!
A simple function to open or create a file.
@return own: handle to the file, not defined if error, error number
can be retrieved with os_file_get_last_error */
UNIV_INTERN
os_file_t
-os_file_create_simple(
-/*==================*/
+os_file_create_simple_func(
+/*=======================*/
const char* name, /*!< in: name of the file or path as a
null-terminated string */
ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file is
@@ -333,13 +464,15 @@ os_file_create_simple(
OS_FILE_READ_WRITE */
ibool* success);/*!< out: TRUE if succeed, FALSE if error */
/****************************************************************//**
+NOTE! Use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
A simple function to open or create a file.
@return own: handle to the file, not defined if error, error number
can be retrieved with os_file_get_last_error */
UNIV_INTERN
os_file_t
-os_file_create_simple_no_error_handling(
-/*====================================*/
+os_file_create_simple_no_error_handling_func(
+/*=========================================*/
const char* name, /*!< in: name of the file or path as a
null-terminated string */
ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
@@ -363,13 +496,15 @@ os_file_set_nocache(
const char* operation_name);/*!< in: "open" or "create"; used in the
diagnostic message */
/****************************************************************//**
+NOTE! Use the corresponding macro os_file_create(), not directly
+this function!
Opens an existing file or creates a new.
@return own: handle to the file, not defined if error, error number
can be retrieved with os_file_get_last_error */
UNIV_INTERN
os_file_t
-os_file_create(
-/*===========*/
+os_file_create_func(
+/*================*/
const char* name, /*!< in: name of the file or path as a
null-terminated string */
ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
@@ -407,25 +542,258 @@ os_file_delete_if_exists(
/*=====================*/
const char* name); /*!< in: file path as a null-terminated string */
/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_rename(), not directly
+this function!
Renames a file (can also move it to another directory). It is safest that the
file is closed before calling this function.
@return TRUE if success */
UNIV_INTERN
ibool
-os_file_rename(
-/*===========*/
+os_file_rename_func(
+/*================*/
const char* oldpath, /*!< in: old file path as a
null-terminated string */
const char* newpath); /*!< in: new file path */
/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_close(), not directly this
+function!
Closes a file handle. In case of error, error number can be retrieved with
os_file_get_last_error.
@return TRUE if success */
UNIV_INTERN
ibool
-os_file_close(
-/*==========*/
+os_file_close_func(
+/*===============*/
os_file_t file); /*!< in, own: handle to a file */
+
+#ifdef UNIV_PFS_IO
+/****************************************************************//**
+NOTE! Please use the corresponding macro os_file_create_simple(),
+not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple() which opens or creates a file.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+os_file_t
+pfs_os_file_create_simple_func(
+/*===========================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file is
+ opened (if does not exist, error), or
+ OS_FILE_CREATE if a new file is created
+ (if exists, error), or
+ OS_FILE_CREATE_PATH if new file
+ (if exists, error) and subdirectories along
+ its path are created (if needed)*/
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY or
+ OS_FILE_READ_WRITE */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+
+/****************************************************************//**
+NOTE! Please use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple_no_error_handling(). Add instrumentation to
+monitor file creation/open.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+os_file_t
+pfs_os_file_create_simple_no_error_handling_func(
+/*=============================================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
+ is opened (if does not exist, error), or
+ OS_FILE_CREATE if a new file is created
+ (if exists, error) */
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY,
+ OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option is
+ used by a backup program reading the file */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+
+/****************************************************************//**
+NOTE! Please use the corresponding macro os_file_create(), not directly
+this function!
+A performance schema wrapper function for os_file_create().
+Add instrumentation to monitor file creation/open.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+os_file_t
+pfs_os_file_create_func(
+/*====================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
+ is opened (if does not exist, error), or
+ OS_FILE_CREATE if a new file is created
+ (if exists, error),
+ OS_FILE_OVERWRITE if a new file is created
+ or an old overwritten;
+ OS_FILE_OPEN_RAW, if a raw device or disk
+ partition should be opened */
+ ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous,
+ non-buffered i/o is desired,
+ OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use
+ async i/o or unbuffered i/o: look in the
+ function source code for the exact rules */
+ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_close(), not directly
+this function!
+A performance schema instrumented wrapper function for os_file_close().
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_os_file_close_func(
+/*===================*/
+ os_file_t file, /*!< in, own: handle to a file */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+/*******************************************************************//**
+NOTE! Please use the corresponding macro os_file_read(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_read() which requests a synchronous read operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INLINE
+ibool
+pfs_os_file_read_func(
+/*==================*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to read */
+ ulint offset_high,/*!< in: most significant 32 bits of
+ offset */
+ ulint n, /*!< in: number of bytes to read */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+
+/*******************************************************************//**
+NOTE! Please use the corresponding macro os_file_read_no_error_handling(),
+not directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_read_no_error_handling_func() which requests a synchronous
+read operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INLINE
+ibool
+pfs_os_file_read_no_error_handling_func(
+/*====================================*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to read */
+ ulint offset_high,/*!< in: most significant 32 bits of
+ offset */
+ ulint n, /*!< in: number of bytes to read */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+
+/*******************************************************************//**
+NOTE! Please use the corresponding macro os_aio(), not directly this
+function!
+Performance schema wrapper function of os_aio() which requests
+an asynchronous i/o operation.
+@return TRUE if request was queued successfully, FALSE if fail */
+UNIV_INLINE
+ibool
+pfs_os_aio_func(
+/*============*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
+ ulint mode, /*!< in: OS_AIO_NORMAL etc. I/O mode */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read or from which
+ to write */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to read or write */
+ ulint offset_high,/*!< in: most significant 32 bits of
+ offset */
+ ulint n, /*!< in: number of bytes to read or write */
+ fil_node_t* message1,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ void* message2,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+/*******************************************************************//**
+NOTE! Please use the corresponding macro os_file_write(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_write() which requests a synchronous write operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INLINE
+ibool
+pfs_os_file_write_func(
+/*===================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ const void* buf, /*!< in: buffer from which to write */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to write */
+ ulint offset_high,/*!< in: most significant 32 bits of
+ offset */
+ ulint n, /*!< in: number of bytes to write */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_flush(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_flush() which flushes the write buffers of a given file to the disk.
+Flushes the write buffers of a given file to the disk.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_os_file_flush_func(
+/*===================*/
+ os_file_t file, /*!< in, own: handle to a file */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_rename(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_rename()
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_os_file_rename_func(
+/*====================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* oldpath,/*!< in: old file path as a null-terminated
+ string */
+ const char* newpath,/*!< in: new file path */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+#endif /* UNIV_PFS_IO */
+
#ifdef UNIV_HOTBACKUP
/***********************************************************************//**
Closes a file handle.
@@ -477,12 +845,13 @@ os_file_set_eof(
/*============*/
FILE* file); /*!< in: file to be truncated */
/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_flush(), not directly this function!
Flushes the write buffers of a given file to the disk.
@return TRUE if success */
UNIV_INTERN
ibool
-os_file_flush(
-/*==========*/
+os_file_flush_func(
+/*===============*/
os_file_t file); /*!< in, own: handle to a file */
/***********************************************************************//**
Retrieves the last error number if an error occurs in a file io function.
@@ -497,12 +866,13 @@ os_file_get_last_error(
ibool report_all_errors); /*!< in: TRUE if we want an error message
printed of all errors */
/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_read(), not directly this function!
Requests a synchronous read operation.
@return TRUE if request was successful, FALSE if fail */
UNIV_INTERN
ibool
-os_file_read(
-/*=========*/
+os_file_read_func(
+/*==============*/
os_file_t file, /*!< in: handle to a file */
void* buf, /*!< in: buffer where to read */
ulint offset, /*!< in: least significant 32 bits of file
@@ -522,13 +892,15 @@ os_file_read_string(
char* str, /*!< in: buffer where to read */
ulint size); /*!< in: size of buffer */
/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_read_no_error_handling(),
+not directly this function!
Requests a synchronous positioned read operation. This function does not do
any error handling. In case of error it returns FALSE.
@return TRUE if request was successful, FALSE if fail */
UNIV_INTERN
ibool
-os_file_read_no_error_handling(
-/*===========================*/
+os_file_read_no_error_handling_func(
+/*================================*/
os_file_t file, /*!< in: handle to a file */
void* buf, /*!< in: buffer where to read */
ulint offset, /*!< in: least significant 32 bits of file
@@ -538,12 +910,14 @@ os_file_read_no_error_handling(
ulint n); /*!< in: number of bytes to read */
/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_write(), not directly this
+function!
Requests a synchronous write operation.
@return TRUE if request was successful, FALSE if fail */
UNIV_INTERN
ibool
-os_file_write(
-/*==========*/
+os_file_write_func(
+/*===============*/
const char* name, /*!< in: name of the file or path as a
null-terminated string */
os_file_t file, /*!< in: handle to a file */
@@ -612,7 +986,7 @@ respectively. The caller must create an i/o handler thread for each
segment in these arrays. This function also creates the sync array.
No i/o handler thread needs to be created for that */
UNIV_INTERN
-void
+ibool
os_aio_init(
/*========*/
ulint n_per_seg, /*<! in: maximum number of pending aio
@@ -629,12 +1003,13 @@ os_aio_free(void);
/*=============*/
/*******************************************************************//**
+NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an asynchronous i/o operation.
@return TRUE if request was queued successfully, FALSE if fail */
UNIV_INTERN
ibool
-os_aio(
-/*===*/
+os_aio_func(
+/*========*/
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
to OS_AIO_SIMULATED_WAKE_LATER: the
@@ -802,4 +1177,36 @@ innobase_mysql_tmpfile(void);
/*========================*/
#endif /* !UNIV_HOTBACKUP && !__NETWARE__ */
+
+#if defined(LINUX_NATIVE_AIO)
+/**************************************************************************
+This function is only used in Linux native asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait the
+for completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
+@return TRUE if the IO was successful */
+UNIV_INTERN
+ibool
+os_aio_linux_handle(
+/*================*/
+ ulint global_seg, /*!< in: segment number in the aio array
+ to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 is log i/o thread,
+ then follow the non-ibuf read threads,
+ and the last are the non-ibuf write
+ threads. */
+ fil_node_t**message1, /*!< out: the messages passed with the */
+ void** message2, /*!< aio request; note that in case the
+ aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation. */
+ ulint* type); /*!< out: OS_FILE_WRITE or ..._READ */
+#endif /* LINUX_NATIVE_AIO */
+
+#ifndef UNIV_NONINL
+#include "os0file.ic"
+#endif
+
#endif
diff --git a/storage/innobase/include/os0file.ic b/storage/innobase/include/os0file.ic
new file mode 100644
index 00000000000..32f0e7cf666
--- /dev/null
+++ b/storage/innobase/include/os0file.ic
@@ -0,0 +1,408 @@
+/*****************************************************************************
+
+Copyright (c) 2010, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0file.ic
+The interface to the operating system file io
+
+Created 2/20/2010 Jimmy Yang
+*******************************************************/
+
+#include "univ.i"
+
+#ifdef UNIV_PFS_IO
+/****************************************************************//**
+NOTE! Please use the corresponding macro os_file_create_simple(),
+not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple() which opens or creates a file.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+os_file_t
+pfs_os_file_create_simple_func(
+/*===========================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file is
+ opened (if does not exist, error), or
+ OS_FILE_CREATE if a new file is created
+ (if exists, error), or
+ OS_FILE_CREATE_PATH if new file
+ (if exists, error) and subdirectories along
+ its path are created (if needed)*/
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY or
+ OS_FILE_READ_WRITE */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ os_file_t file;
+ struct PSI_file_locker* locker = NULL;
+
+ /* register a file open or creation depending on "create_mode" */
+ register_pfs_file_open_begin(locker, key,
+ ((create_mode == OS_FILE_CREATE)
+ ? PSI_FILE_CREATE
+ : PSI_FILE_OPEN),
+ name, src_file, src_line);
+
+ file = os_file_create_simple_func(name, create_mode,
+ access_type, success);
+
+ /* Regsiter the returning "file" value with the system */
+ register_pfs_file_open_end(locker, file);
+
+ return(file);
+}
+
+/****************************************************************//**
+NOTE! Please use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple_no_error_handling(). Add instrumentation to
+monitor file creation/open.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+os_file_t
+pfs_os_file_create_simple_no_error_handling_func(
+/*=============================================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
+ is opened (if does not exist, error), or
+ OS_FILE_CREATE if a new file is created
+ (if exists, error) */
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY,
+ OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option is
+ used by a backup program reading the file */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ os_file_t file;
+ struct PSI_file_locker* locker = NULL;
+
+ /* register a file open or creation depending on "create_mode" */
+ register_pfs_file_open_begin(locker, key,
+ ((create_mode == OS_FILE_CREATE)
+ ? PSI_FILE_CREATE
+ : PSI_FILE_OPEN),
+ name, src_file, src_line);
+
+ file = os_file_create_simple_no_error_handling_func(
+ name, create_mode, access_type, success);
+
+ register_pfs_file_open_end(locker, file);
+
+ return(file);
+}
+
+/****************************************************************//**
+NOTE! Please use the corresponding macro os_file_create(), not directly
+this function!
+A performance schema wrapper function for os_file_create().
+Add instrumentation to monitor file creation/open.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+os_file_t
+pfs_os_file_create_func(
+/*====================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
+ is opened (if does not exist, error), or
+ OS_FILE_CREATE if a new file is created
+ (if exists, error),
+ OS_FILE_OVERWRITE if a new file is created
+ or an old overwritten;
+ OS_FILE_OPEN_RAW, if a raw device or disk
+ partition should be opened */
+ ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous,
+ non-buffered i/o is desired,
+ OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use
+ async i/o or unbuffered i/o: look in the
+ function source code for the exact rules */
+ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ os_file_t file;
+ struct PSI_file_locker* locker = NULL;
+
+ /* register a file open or creation depending on "create_mode" */
+ register_pfs_file_open_begin(locker, key,
+ ((create_mode == OS_FILE_CREATE)
+ ? PSI_FILE_CREATE
+ : PSI_FILE_OPEN),
+ name, src_file, src_line);
+
+ file = os_file_create_func(name, create_mode, purpose, type, success);
+
+ register_pfs_file_open_end(locker, file);
+
+ return(file);
+}
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_close(), not directly
+this function!
+A performance schema instrumented wrapper function for os_file_close().
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_os_file_close_func(
+/*===================*/
+ os_file_t file, /*!< in, own: handle to a file */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ ibool result;
+ struct PSI_file_locker* locker = NULL;
+
+ /* register the file close */
+ register_pfs_file_io_begin(locker, file, 0, PSI_FILE_CLOSE,
+ src_file, src_line);
+
+ result = os_file_close_func(file);
+
+ register_pfs_file_io_end(locker, 0);
+
+ return(result);
+}
+
+/*******************************************************************//**
+NOTE! Please use the corresponding macro os_aio(), not directly this
+function!
+Performance schema instrumented wrapper function of os_aio() which
+requests an asynchronous i/o operation.
+@return TRUE if request was queued successfully, FALSE if fail */
+UNIV_INLINE
+ibool
+pfs_os_aio_func(
+/*============*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
+ ulint mode, /*!< in: OS_AIO_NORMAL etc. I/O mode */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read or from which
+ to write */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to read or write */
+ ulint offset_high,/*!< in: most significant 32 bits of
+ offset */
+ ulint n, /*!< in: number of bytes to read or write */
+ fil_node_t* message1,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ void* message2,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ ibool result;
+ struct PSI_file_locker* locker = NULL;
+
+ /* Register the read or write I/O depending on "type" */
+ register_pfs_file_io_begin(locker, file, n,
+ (type == OS_FILE_WRITE)
+ ? PSI_FILE_WRITE
+ : PSI_FILE_READ,
+ src_file, src_line);
+
+ result = os_aio_func(type, mode, name, file, buf, offset, offset_high,
+ n, message1, message2);
+
+ register_pfs_file_io_end(locker, n);
+
+ return(result);
+}
+
+/*******************************************************************//**
+NOTE! Please use the corresponding macro os_file_read(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_read() which requests a synchronous read operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INLINE
+ibool
+pfs_os_file_read_func(
+/*==================*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to read */
+ ulint offset_high,/*!< in: most significant 32 bits of
+ offset */
+ ulint n, /*!< in: number of bytes to read */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ ibool result;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_io_begin(locker, file, n, PSI_FILE_READ,
+ src_file, src_line);
+
+ result = os_file_read_func(file, buf, offset, offset_high, n);
+
+ register_pfs_file_io_end(locker, n);
+
+ return(result);
+}
+
+/*******************************************************************//**
+NOTE! Please use the corresponding macro
+os_file_read_no_error_handling(), not directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_read_no_error_handling() which requests a synchronous
+positioned read operation. This function does not do any error
+handling. In case of error it returns FALSE.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INLINE
+ibool
+pfs_os_file_read_no_error_handling_func(
+/*====================================*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to read */
+ ulint offset_high,/*!< in: most significant 32 bits of
+ offset */
+ ulint n, /*!< in: number of bytes to read */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ ibool result;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_io_begin(locker, file, n, PSI_FILE_READ,
+ src_file, src_line);
+
+ result = os_file_read_no_error_handling_func(file, buf, offset,
+ offset_high, n);
+
+ register_pfs_file_io_end(locker, n);
+
+ return(result);
+}
+
+/*******************************************************************//**
+NOTE! Please use the corresponding macro os_file_write(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_write() which requests a synchronous write operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INLINE
+ibool
+pfs_os_file_write_func(
+/*===================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ const void* buf, /*!< in: buffer from which to write */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to write */
+ ulint offset_high,/*!< in: most significant 32 bits of
+ offset */
+ ulint n, /*!< in: number of bytes to write */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ ibool result;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_io_begin(locker, file, n, PSI_FILE_WRITE,
+ src_file, src_line);
+
+ result = os_file_write_func(name, file, buf, offset, offset_high, n);
+
+ register_pfs_file_io_end(locker, n);
+
+ return(result);
+}
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_flush(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_flush() which flushes the write buffers of a given file to the disk.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_os_file_flush_func(
+/*===================*/
+ os_file_t file, /*!< in, own: handle to a file */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ ibool result;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_io_begin(locker, file, 0, PSI_FILE_SYNC,
+ src_file, src_line);
+ result = os_file_flush_func(file);
+
+ register_pfs_file_io_end(locker, 0);
+
+ return(result);
+}
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_rename(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_rename()
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_os_file_rename_func(
+/*====================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* oldpath,/*!< in: old file path as a null-terminated
+ string */
+ const char* newpath,/*!< in: new file path */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ ibool result;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_open_begin(locker, key, PSI_FILE_RENAME, newpath,
+ src_file, src_line);
+
+ result = os_file_rename_func(oldpath, newpath);
+
+ register_pfs_file_open_end(locker, 0);
+
+ return(result);
+}
+#endif /* UNIV_PFS_IO */
diff --git a/storage/innobase/include/os0thread.h b/storage/innobase/include/os0thread.h
index 6583de0005f..cc56e2158ee 100644
--- a/storage/innobase/include/os0thread.h
+++ b/storage/innobase/include/os0thread.h
@@ -56,6 +56,11 @@ typedef os_thread_t os_thread_id_t; /*!< In Unix we use the thread
/* Define a function pointer type to use in a typecast */
typedef void* (*os_posix_f_t) (void*);
+#ifdef HAVE_PSI_INTERFACE
+/* Define for performance schema registration key */
+typedef unsigned int mysql_pfs_key_t;
+#endif
+
/***************************************************************//**
Compares two thread ids for equality.
@return TRUE if equal */
@@ -86,7 +91,7 @@ os_thread_t
os_thread_create(
/*=============*/
#ifndef __WIN__
- os_posix_f_t start_f,
+ os_posix_f_t start_f,
#else
ulint (*start_f)(void*), /*!< in: pointer to function
from which to start */
diff --git a/storage/innobase/include/pars0pars.h b/storage/innobase/include/pars0pars.h
index fe5d76ebbb0..524fe4ac3e7 100644
--- a/storage/innobase/include/pars0pars.h
+++ b/storage/innobase/include/pars0pars.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -520,6 +520,23 @@ pars_info_add_int4_literal(
Equivalent to:
char buf[8];
+mach_write_ull(buf, val);
+pars_info_add_literal(info, name, buf, 8, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+UNIV_INTERN
+void
+pars_info_add_uint64_literal(
+/*=========================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ ib_uint64_t val); /*!< in: value */
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[8];
mach_write_to_8(buf, val);
pars_info_add_literal(info, name, buf, 8, DATA_BINARY, 0);
diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h
index 420f34550e2..39f8d07af89 100644
--- a/storage/innobase/include/que0que.h
+++ b/storage/innobase/include/que0que.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -30,6 +30,7 @@ Created 5/27/1996 Heikki Tuuri
#include "data0data.h"
#include "dict0types.h"
#include "trx0trx.h"
+#include "trx0roll.h"
#include "srv0srv.h"
#include "usr0types.h"
#include "que0types.h"
@@ -215,6 +216,16 @@ trx_t*
thr_get_trx(
/*========*/
que_thr_t* thr); /*!< in: query thread */
+/*******************************************************************//**
+Determines if this thread is rolling back an incomplete transaction
+in crash recovery.
+@return TRUE if thr is rolling back an incomplete transaction in crash
+recovery */
+UNIV_INLINE
+ibool
+thr_is_recv(
+/*========*/
+ const que_thr_t* thr); /*!< in: query thread */
/***********************************************************************//**
Gets the type of a graph node. */
UNIV_INLINE
diff --git a/storage/innobase/include/que0que.ic b/storage/innobase/include/que0que.ic
index a1c0dc1e77a..bd936670e1e 100644
--- a/storage/innobase/include/que0que.ic
+++ b/storage/innobase/include/que0que.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -38,6 +38,20 @@ thr_get_trx(
return(thr->graph->trx);
}
+/*******************************************************************//**
+Determines if this thread is rolling back an incomplete transaction
+in crash recovery.
+@return TRUE if thr is rolling back an incomplete transaction in crash
+recovery */
+UNIV_INLINE
+ibool
+thr_is_recv(
+/*========*/
+ const que_thr_t* thr) /*!< in: query thread */
+{
+ return(trx_is_recv(thr->graph->trx));
+}
+
/***********************************************************************//**
Gets the first thr in a fork. */
UNIV_INLINE
diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h
index fbeb125ce7b..be7c77e7724 100644
--- a/storage/innobase/include/row0merge.h
+++ b/storage/innobase/include/row0merge.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h
index b05241f00f8..d2a8734c61f 100644
--- a/storage/innobase/include/row0mysql.h
+++ b/storage/innobase/include/row0mysql.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2000, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2000, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -451,6 +451,12 @@ row_drop_table_for_mysql(
const char* name, /*!< in: table name */
trx_t* trx, /*!< in: transaction handle */
ibool drop_db);/*!< in: TRUE=dropping whole database */
+/*********************************************************************//**
+Drop all temporary tables during crash recovery. */
+UNIV_INTERN
+void
+row_mysql_drop_temp_tables(void);
+/*============================*/
/*********************************************************************//**
Discards the tablespace of a table which stored in an .ibd file. Discarding
@@ -494,14 +500,19 @@ row_rename_table_for_mysql(
trx_t* trx, /*!< in: transaction handle */
ibool commit); /*!< in: if TRUE then commit trx */
/*********************************************************************//**
-Checks a table for corruption.
-@return DB_ERROR or DB_SUCCESS */
+Checks that the index contains entries in an ascending order, unique
+constraint is not broken, and calculates the number of index entries
+in the read view of the current transaction.
+@return DB_SUCCESS if ok */
UNIV_INTERN
ulint
-row_check_table_for_mysql(
+row_check_index_for_mysql(
/*======================*/
- row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL
- handle */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct
+ in MySQL handle */
+ const dict_index_t* index, /*!< in: index */
+ ulint* n_rows); /*!< out: number of entries
+ seen in the consistent read */
/*********************************************************************//**
Determines if a table is a magic monitor table.
diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h
index 89ec54fb54a..485d51dbc83 100644
--- a/storage/innobase/include/row0purge.h
+++ b/storage/innobase/include/row0purge.h
@@ -45,6 +45,28 @@ row_purge_node_create(
que_thr_t* parent, /*!< in: parent node, i.e., a thr node */
mem_heap_t* heap); /*!< in: memory heap where created */
/***********************************************************//**
+Determines if it is possible to remove a secondary index entry.
+Removal is possible if the secondary index entry does not refer to any
+not delete marked version of a clustered index record where DB_TRX_ID
+is newer than the purge view.
+
+NOTE: This function should only be called by the purge thread, only
+while holding a latch on the leaf page of the secondary index entry
+(or keeping the buffer pool watch on the page). It is possible that
+this function first returns TRUE and then FALSE, if a user transaction
+inserts a record that the secondary index entry would refer to.
+However, in that case, the user transaction would also re-insert the
+secondary index entry after purge has removed it and released the leaf
+page latch.
+@return TRUE if the secondary index record can be purged */
+UNIV_INTERN
+ibool
+row_purge_poss_sec(
+/*===============*/
+ purge_node_t* node, /*!< in/out: row purge node */
+ dict_index_t* index, /*!< in: secondary index */
+ const dtuple_t* entry); /*!< in: secondary index entry */
+/***************************************************************
Does the purge operation for a single undo log record. This is a high-level
function used in an SQL execution graph.
@return query thread to run next or NULL */
diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h
index 723b7b53395..195691a420b 100644
--- a/storage/innobase/include/row0row.h
+++ b/storage/innobase/include/row0row.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -257,11 +257,25 @@ row_get_clust_rec(
dict_index_t* index, /*!< in: secondary index */
dict_index_t** clust_index,/*!< out: clustered index */
mtr_t* mtr); /*!< in: mtr */
+
+/** Result of row_search_index_entry */
+enum row_search_result {
+ ROW_FOUND = 0, /*!< the record was found */
+ ROW_NOT_FOUND, /*!< record not found */
+ ROW_BUFFERED, /*!< one of BTR_INSERT, BTR_DELETE, or
+ BTR_DELETE_MARK was specified, the
+ secondary index leaf page was not in
+ the buffer pool, and the operation was
+ enqueued in the insert/delete buffer */
+ ROW_NOT_DELETED_REF, /*!< BTR_DELETE was specified, and
+ row_purge_poss_sec() failed */
+};
+
/***************************************************************//**
Searches an index record.
-@return TRUE if found */
+@return whether the record was found or buffered */
UNIV_INTERN
-ibool
+enum row_search_result
row_search_index_entry(
/*===================*/
dict_index_t* index, /*!< in: index */
diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h
index 01a5afaa23e..8544b9d08ba 100644
--- a/storage/innobase/include/row0sel.h
+++ b/storage/innobase/include/row0sel.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -105,17 +105,6 @@ row_fetch_print(
/*============*/
void* row, /*!< in: sel_node_t* */
void* user_arg); /*!< in: not used */
-/****************************************************************//**
-Callback function for fetch that stores an unsigned 4 byte integer to the
-location pointed. The column's type must be DATA_INT, DATA_UNSIGNED, length
-= 4.
-@return always returns NULL */
-UNIV_INTERN
-void*
-row_fetch_store_uint4(
-/*==================*/
- void* row, /*!< in: sel_node_t* */
- void* user_arg); /*!< in: data pointer */
/***********************************************************//**
Prints a row in a select result.
@return query thread to run next or NULL */
diff --git a/storage/innobase/include/row0types.h b/storage/innobase/include/row0types.h
index 1be729206ba..7d6a7c8e2b1 100644
--- a/storage/innobase/include/row0types.h
+++ b/storage/innobase/include/row0types.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index ce7f6004813..74c604124f5 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -1,7 +1,8 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, 2009, Google Inc.
+Copyright (c) 2009, Percona Inc.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -9,6 +10,13 @@ briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
@@ -22,32 +30,6 @@ this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA
*****************************************************************************/
-/***********************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-Copyright (c) 2009, Percona Inc.
-
-Portions of this file contain modifications contributed and copyrighted
-by Percona Inc.. Those modifications are
-gratefully acknowledged and are described briefly in the InnoDB
-documentation. The contributions by Percona Inc. are incorporated with
-their permission, and subject to the conditions contained in the file
-COPYING.Percona.
-
-This program is free software; you can redistribute it and/or modify it
-under the terms of the GNU General Public License as published by the
-Free Software Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
-Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-***********************************************************************/
/**************************************************//**
@file include/srv0srv.h
@@ -125,6 +107,11 @@ on duplicate key checking and foreign key checking */
extern ibool srv_locks_unsafe_for_binlog;
#endif /* !UNIV_HOTBACKUP */
+/* If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use simulated aio we build below with threads.
+Currently we support native aio on windows and linux */
+extern my_bool srv_use_native_aio;
extern ulint srv_n_data_files;
extern char** srv_data_file_names;
extern ulint* srv_data_file_sizes;
@@ -224,7 +211,8 @@ extern ibool srv_print_innodb_tablespace_monitor;
extern ibool srv_print_verbose_log;
extern ibool srv_print_innodb_table_monitor;
-extern ibool srv_lock_timeout_and_monitor_active;
+extern ibool srv_lock_timeout_active;
+extern ibool srv_monitor_active;
extern ibool srv_error_monitor_active;
extern ulong srv_n_spin_wait_rounds;
@@ -283,6 +271,12 @@ extern ulint srv_os_log_pending_writes;
log buffer and have to flush it */
extern ulint srv_log_waits;
+/* the number of purge threads to use from the worker pool (currently 0 or 1) */
+extern ulint srv_n_purge_threads;
+
+/* the number of records to purge in one batch */
+extern ulint srv_purge_batch_size;
+
/* variable that counts amount of data read in total (in bytes) */
extern ulint srv_data_read;
@@ -324,6 +318,37 @@ typedef struct srv_sys_struct srv_sys_t;
/** The server system */
extern srv_sys_t* srv_sys;
+
+# ifdef UNIV_PFS_THREAD
+/* Keys to register InnoDB threads with performance schema */
+extern mysql_pfs_key_t trx_rollback_clean_thread_key;
+extern mysql_pfs_key_t io_handler_thread_key;
+extern mysql_pfs_key_t srv_lock_timeout_thread_key;
+extern mysql_pfs_key_t srv_error_monitor_thread_key;
+extern mysql_pfs_key_t srv_monitor_thread_key;
+extern mysql_pfs_key_t srv_master_thread_key;
+
+/* This macro register the current thread and its key with performance
+schema */
+# define pfs_register_thread(key) \
+do { \
+ if (PSI_server) { \
+ struct PSI_thread* psi = PSI_server->new_thread(key, NULL, 0);\
+ if (psi) { \
+ PSI_server->set_thread(psi); \
+ } \
+ } \
+} while (0)
+
+/* This macro delist the current thread from performance schema */
+# define pfs_delete_thread() \
+do { \
+ if (PSI_server) { \
+ PSI_server->delete_current_thread(); \
+ } \
+} while (0)
+# endif /* UNIV_PFS_THREAD */
+
#endif /* !UNIV_HOTBACKUP */
/** Types of raw partitions in innodb_data_file_path */
@@ -464,6 +489,12 @@ srv_master_thread(
void* arg); /*!< in: a dummy parameter required by
os_thread_create */
/*******************************************************************//**
+Wakes up the purge thread if it's not already awake. */
+UNIV_INTERN
+void
+srv_wake_purge_thread(void);
+/*=======================*/
+/*******************************************************************//**
Tells the Innobase server that there has been activity in the database
and wakes up the master thread if it is suspended (not sleeping). Used
in the MySQL interface. Note that there is a small chance that the master
@@ -479,6 +510,16 @@ UNIV_INTERN
void
srv_wake_master_thread(void);
/*========================*/
+/*******************************************************************//**
+Tells the purge thread that there has been activity in the database
+and wakes up the purge thread if it is suspended (not sleeping). Note
+that there is a small chance that the purge thread stays suspended
+(we do not protect our operation with the kernel mutex, for
+performace reasons). */
+UNIV_INTERN
+void
+srv_wake_purge_thread_if_not_active(void);
+/*=====================================*/
/*********************************************************************//**
Puts an OS thread to wait if there are too many concurrent threads
(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
@@ -537,15 +578,23 @@ srv_release_mysql_thread_if_suspended(
MySQL OS thread */
/*********************************************************************//**
A thread which wakes up threads whose lock wait may have lasted too long.
-This also prints the info output by various InnoDB monitors.
@return a dummy parameter */
UNIV_INTERN
os_thread_ret_t
-srv_lock_timeout_and_monitor_thread(
-/*================================*/
+srv_lock_timeout_thread(
+/*====================*/
void* arg); /*!< in: a dummy parameter required by
os_thread_create */
/*********************************************************************//**
+A thread which prints the info output by various InnoDB monitors.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_monitor_thread(
+/*===============*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+/*************************************************************************
A thread which prints warnings about semaphore waits which have lasted
too long. These can be used to track bugs which cause hangs.
@return a dummy parameter */
@@ -556,12 +605,15 @@ srv_error_monitor_thread(
void* arg); /*!< in: a dummy parameter required by
os_thread_create */
/******************************************************************//**
-Outputs to a file the output of the InnoDB Monitor. */
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information printed
+due to failure to obtain necessary mutex */
UNIV_INTERN
-void
+ibool
srv_printf_innodb_monitor(
/*======================*/
FILE* file, /*!< in: output stream */
+ ibool nowait, /*!< in: whether to wait for kernel mutex */
ulint* trx_start, /*!< out: file position of the start of
the list of active transactions */
ulint* trx_end); /*!< out: file position of the end of
@@ -574,6 +626,15 @@ void
srv_export_innodb_status(void);
/*==========================*/
+/*********************************************************************//**
+Asynchronous purge thread.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_purge_thread(
+/*=============*/
+ void* arg __attribute__((unused))); /*!< in: a dummy parameter
+ required by os_thread_create */
/** Thread slot in the thread table */
typedef struct srv_slot_struct srv_slot_t;
@@ -643,8 +704,9 @@ struct srv_sys_struct{
extern ulint srv_n_threads_active[];
#else /* !UNIV_HOTBACKUP */
-# define srv_use_checksums TRUE
# define srv_use_adaptive_hash_indexes FALSE
+# define srv_use_checksums TRUE
+# define srv_use_native_aio FALSE
# define srv_force_recovery 0UL
# define srv_set_io_thread_op_info(t,info) ((void) 0)
# define srv_is_being_started 0
diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h
index aedfd5f3f86..6233ceef748 100644
--- a/storage/innobase/include/sync0rw.h
+++ b/storage/innobase/include/sync0rw.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -105,23 +105,140 @@ extern ib_int64_t rw_x_os_wait_count;
set only when UNIV_SYNC_PERF_STAT is defined */
extern ib_int64_t rw_x_exit_count;
+#ifdef UNIV_PFS_RWLOCK
+/* Following are rwlock keys used to register with MySQL
+performance schema */
+# ifdef UNIV_LOG_ARCHIVE
+extern mysql_pfs_key_t archive_lock_key;
+# endif /* UNIV_LOG_ARCHIVE */
+extern mysql_pfs_key_t btr_search_latch_key;
+extern mysql_pfs_key_t buf_block_lock_key;
+# ifdef UNIV_SYNC_DEBUG
+extern mysql_pfs_key_t buf_block_debug_latch_key;
+# endif /* UNIV_SYNC_DEBUG */
+extern mysql_pfs_key_t dict_operation_lock_key;
+extern mysql_pfs_key_t fil_space_latch_key;
+extern mysql_pfs_key_t checkpoint_lock_key;
+extern mysql_pfs_key_t trx_i_s_cache_lock_key;
+extern mysql_pfs_key_t trx_purge_latch_key;
+extern mysql_pfs_key_t index_tree_rw_lock_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+
+#ifndef UNIV_PFS_RWLOCK
/******************************************************************//**
Creates, or rather, initializes an rw-lock object in a specified memory
location (which must be appropriately aligned). The rw-lock is initialized
to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
-is necessary only if the memory block containing it is freed. */
-#ifdef UNIV_DEBUG
-# ifdef UNIV_SYNC_DEBUG
-# define rw_lock_create(L, level) \
+is necessary only if the memory block containing it is freed.
+if MySQL performance schema is enabled and "UNIV_PFS_RWLOCK" is
+defined, the rwlock are instrumented with performance schema probes. */
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+# define rw_lock_create(K, L, level) \
rw_lock_create_func((L), (level), #L, __FILE__, __LINE__)
-# else /* UNIV_SYNC_DEBUG */
-# define rw_lock_create(L, level) \
+# else /* UNIV_SYNC_DEBUG */
+# define rw_lock_create(K, L, level) \
rw_lock_create_func((L), #L, __FILE__, __LINE__)
-# endif /* UNIV_SYNC_DEBUG */
-#else /* UNIV_DEBUG */
-# define rw_lock_create(L, level) \
+# endif/* UNIV_SYNC_DEBUG */
+# else /* UNIV_DEBUG */
+# define rw_lock_create(K, L, level) \
rw_lock_create_func((L), __FILE__, __LINE__)
-#endif /* UNIV_DEBUG */
+# endif /* UNIV_DEBUG */
+
+/**************************************************************//**
+NOTE! The following macros should be used in rw locking and
+unlocking, not the corresponding function. */
+
+# define rw_lock_s_lock(M) \
+ rw_lock_s_lock_func((M), 0, __FILE__, __LINE__)
+
+# define rw_lock_s_lock_gen(M, P) \
+ rw_lock_s_lock_func((M), (P), __FILE__, __LINE__)
+
+# define rw_lock_s_lock_nowait(M, F, L) \
+ rw_lock_s_lock_low((M), 0, (F), (L))
+
+# ifdef UNIV_SYNC_DEBUG
+# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(P, L)
+# else
+# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(L)
+# endif
+
+
+# define rw_lock_x_lock(M) \
+ rw_lock_x_lock_func((M), 0, __FILE__, __LINE__)
+
+# define rw_lock_x_lock_gen(M, P) \
+ rw_lock_x_lock_func((M), (P), __FILE__, __LINE__)
+
+# define rw_lock_x_lock_nowait(M) \
+ rw_lock_x_lock_func_nowait((M), __FILE__, __LINE__)
+
+# ifdef UNIV_SYNC_DEBUG
+# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(P, L)
+# else
+# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(L)
+# endif
+
+# define rw_lock_free(M) rw_lock_free_func(M)
+
+#else /* !UNIV_PFS_RWLOCK */
+
+/* Following macros point to Performance Schema instrumented functions. */
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+# define rw_lock_create(K, L, level) \
+ pfs_rw_lock_create_func((K), (L), (level), #L, __FILE__, __LINE__)
+# else /* UNIV_SYNC_DEBUG */
+# define rw_lock_create(K, L, level) \
+ pfs_rw_lock_create_func((K), (L), #L, __FILE__, __LINE__)
+# endif/* UNIV_SYNC_DEBUG */
+# else /* UNIV_DEBUG */
+# define rw_lock_create(K, L, level) \
+ pfs_rw_lock_create_func((K), (L), __FILE__, __LINE__)
+# endif /* UNIV_DEBUG */
+
+/******************************************************************
+NOTE! The following macros should be used in rw locking and
+unlocking, not the corresponding function. */
+
+# define rw_lock_s_lock(M) \
+ pfs_rw_lock_s_lock_func((M), 0, __FILE__, __LINE__)
+
+# define rw_lock_s_lock_gen(M, P) \
+ pfs_rw_lock_s_lock_func((M), (P), __FILE__, __LINE__)
+
+# define rw_lock_s_lock_nowait(M, F, L) \
+ pfs_rw_lock_s_lock_low((M), 0, (F), (L))
+
+# ifdef UNIV_SYNC_DEBUG
+# define rw_lock_s_unlock_gen(L, P) pfs_rw_lock_s_unlock_func(P, L)
+# else
+# define rw_lock_s_unlock_gen(L, P) pfs_rw_lock_s_unlock_func(L)
+# endif
+
+# define rw_lock_x_lock(M) \
+ pfs_rw_lock_x_lock_func((M), 0, __FILE__, __LINE__)
+
+# define rw_lock_x_lock_gen(M, P) \
+ pfs_rw_lock_x_lock_func((M), (P), __FILE__, __LINE__)
+
+# define rw_lock_x_lock_nowait(M) \
+ pfs_rw_lock_x_lock_func_nowait((M), __FILE__, __LINE__)
+
+# ifdef UNIV_SYNC_DEBUG
+# define rw_lock_x_unlock_gen(L, P) pfs_rw_lock_x_unlock_func(P, L)
+# else
+# define rw_lock_x_unlock_gen(L, P) pfs_rw_lock_x_unlock_func(L)
+# endif
+
+# define rw_lock_free(M) pfs_rw_lock_free_func(M)
+
+#endif /* UNIV_PFS_RWLOCK */
+
+#define rw_lock_s_unlock(L) rw_lock_s_unlock_gen(L, 0)
+#define rw_lock_x_unlock(L) rw_lock_x_unlock_gen(L, 0)
/******************************************************************//**
Creates, or rather, initializes an rw-lock object in a specified memory
@@ -137,18 +254,18 @@ rw_lock_create_func(
# ifdef UNIV_SYNC_DEBUG
ulint level, /*!< in: level */
# endif /* UNIV_SYNC_DEBUG */
- const char* cmutex_name, /*!< in: mutex name */
+ const char* cmutex_name, /*!< in: mutex name */
#endif /* UNIV_DEBUG */
const char* cfile_name, /*!< in: file name where created */
- ulint cline); /*!< in: file line where created */
+ ulint cline); /*!< in: file line where created */
/******************************************************************//**
Calling this function is obligatory only if the memory buffer containing
the rw-lock is freed. Removes an rw-lock object from the global list. The
rw-lock is checked to be in the non-locked state. */
UNIV_INTERN
void
-rw_lock_free(
-/*=========*/
+rw_lock_free_func(
+/*==============*/
rw_lock_t* lock); /*!< in: rw-lock */
#ifdef UNIV_DEBUG
/******************************************************************//**
@@ -161,24 +278,6 @@ rw_lock_validate(
/*=============*/
rw_lock_t* lock); /*!< in: rw-lock */
#endif /* UNIV_DEBUG */
-/**************************************************************//**
-NOTE! The following macros should be used in rw s-locking, not the
-corresponding function. */
-
-#define rw_lock_s_lock(M) rw_lock_s_lock_func(\
- (M), 0, __FILE__, __LINE__)
-/**************************************************************//**
-NOTE! The following macros should be used in rw s-locking, not the
-corresponding function. */
-
-#define rw_lock_s_lock_gen(M, P) rw_lock_s_lock_func(\
- (M), (P), __FILE__, __LINE__)
-/**************************************************************//**
-NOTE! The following macros should be used in rw s-locking, not the
-corresponding function. */
-
-#define rw_lock_s_lock_nowait(M, F, L) rw_lock_s_lock_low(\
- (M), 0, (F), (L))
/******************************************************************//**
Low-level function which tries to lock an rw-lock in s-mode. Performs no
spinning.
@@ -233,33 +332,6 @@ rw_lock_s_unlock_func(
#endif
rw_lock_t* lock); /*!< in/out: rw-lock */
-#ifdef UNIV_SYNC_DEBUG
-# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(P, L)
-#else
-# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(L)
-#endif
-/*******************************************************************//**
-Releases a shared mode lock. */
-#define rw_lock_s_unlock(L) rw_lock_s_unlock_gen(L, 0)
-
-/**************************************************************//**
-NOTE! The following macro should be used in rw x-locking, not the
-corresponding function. */
-
-#define rw_lock_x_lock(M) rw_lock_x_lock_func(\
- (M), 0, __FILE__, __LINE__)
-/**************************************************************//**
-NOTE! The following macro should be used in rw x-locking, not the
-corresponding function. */
-
-#define rw_lock_x_lock_gen(M, P) rw_lock_x_lock_func(\
- (M), (P), __FILE__, __LINE__)
-/**************************************************************//**
-NOTE! The following macros should be used in rw x-locking, not the
-corresponding function. */
-
-#define rw_lock_x_lock_nowait(M) rw_lock_x_lock_func_nowait(\
- (M), __FILE__, __LINE__)
/******************************************************************//**
NOTE! Use the corresponding macro, not directly this function! Lock an
rw-lock in exclusive mode for the current thread. If the rw-lock is locked
@@ -290,14 +362,6 @@ rw_lock_x_unlock_func(
#endif
rw_lock_t* lock); /*!< in/out: rw-lock */
-#ifdef UNIV_SYNC_DEBUG
-# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(P, L)
-#else
-# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(L)
-#endif
-/*******************************************************************//**
-Releases an exclusive mode lock. */
-#define rw_lock_x_unlock(L) rw_lock_x_unlock_gen(L, 0)
/******************************************************************//**
Low-level function which locks an rw-lock in s-mode when we know that it
@@ -429,8 +493,9 @@ ibool
rw_lock_own(
/*========*/
rw_lock_t* lock, /*!< in: rw-lock */
- ulint lock_type); /*!< in: lock type: RW_LOCK_SHARED,
+ ulint lock_type) /*!< in: lock type: RW_LOCK_SHARED,
RW_LOCK_EX */
+ __attribute__((warn_unused_result));
#endif /* UNIV_SYNC_DEBUG */
/******************************************************************//**
Checks if somebody has locked the rw-lock in the specified mode. */
@@ -539,6 +604,9 @@ struct rw_lock_struct {
info list of the lock */
ulint level; /*!< Level in the global latching order. */
#endif /* UNIV_SYNC_DEBUG */
+#ifdef UNIV_PFS_RWLOCK
+ struct PSI_rwlock *pfs_psi;/*!< The instrumentation hook */
+#endif
ulint count_os_wait; /*!< Count of os_waits. May not be accurate */
const char* cfile_name;/*!< File name where lock created */
/* last s-lock file/line is not guaranteed to be correct */
@@ -577,6 +645,160 @@ struct rw_lock_debug_struct {
};
#endif /* UNIV_SYNC_DEBUG */
+/* For performance schema instrumentation, a new set of rwlock
+wrap functions are created if "UNIV_PFS_RWLOCK" is defined.
+The instrumentations are not planted directly into original
+functions, so that we keep the underlying function as they
+are. And in case, user wants to "take out" some rwlock from
+instrumentation even if performance schema (UNIV_PFS_RWLOCK)
+is defined, they can do so by reinstating APIs directly link to
+original underlying functions.
+The instrumented function names have prefix of "pfs_rw_lock_" vs.
+original name prefix of "rw_lock_". Following are list of functions
+that have been instrumented:
+
+rw_lock_create()
+rw_lock_x_lock()
+rw_lock_x_lock_gen()
+rw_lock_x_lock_nowait()
+rw_lock_x_unlock_gen()
+rw_lock_s_lock()
+rw_lock_s_lock_gen()
+rw_lock_s_lock_nowait()
+rw_lock_s_unlock_gen()
+rw_lock_free()
+
+Two function APIs rw_lock_x_unlock_direct() and rw_lock_s_unlock_direct()
+do not have any caller/user, they are not instrumented.
+*/
+
+#ifdef UNIV_PFS_RWLOCK
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_create_func()
+NOTE! Please use the corresponding macro rw_lock_create(), not
+directly this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_create_func(
+/*====================*/
+ PSI_rwlock_key key, /*!< in: key registered with
+ performance schema */
+ rw_lock_t* lock, /*!< in: rw lock */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+ const char* cmutex_name, /*!< in: mutex name */
+#endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline); /*!< in: file line where created */
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_lock_func()
+NOTE! Please use the corresponding macro rw_lock_x_lock(), not
+directly this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_lock_func(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Performance schema instrumented wrap function for
+rw_lock_x_lock_func_nowait()
+NOTE! Please use the corresponding macro, not directly this function!
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_rw_lock_x_lock_func_nowait(
+/*===========================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_lock_func()
+NOTE! Please use the corresponding macro rw_lock_s_lock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_s_lock_func(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_lock_func()
+NOTE! Please use the corresponding macro rw_lock_s_lock(), not directly
+this function!
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_rw_lock_s_lock_low(
+/*===================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock will be passed to another
+ thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_lock_func()
+NOTE! Please use the corresponding macro rw_lock_x_lock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_lock_func(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_s_unlock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_s_unlock_func(
+/*======================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock may have been passed to another
+ thread to unlock */
+#endif
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_unlock_func(
+/*======================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock may have been passed to another
+ thread to unlock */
+#endif
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_free_func()
+NOTE! Please use the corresponding macro rw_lock_free(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_free_func(
+/*==================*/
+ rw_lock_t* lock); /*!< in: rw-lock */
+#endif /* UNIV_PFS_RWLOCK */
+
+
#ifndef UNIV_NONINL
#include "sync0rw.ic"
#endif
diff --git a/storage/innobase/include/sync0rw.ic b/storage/innobase/include/sync0rw.ic
index 7116f1b7c9b..73405759bce 100644
--- a/storage/innobase/include/sync0rw.ic
+++ b/storage/innobase/include/sync0rw.ic
@@ -622,3 +622,265 @@ rw_lock_x_unlock_direct(
rw_x_exit_count++;
#endif
}
+
+#ifdef UNIV_PFS_RWLOCK
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_create_func().
+NOTE! Please use the corresponding macro rw_lock_create(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_create_func(
+/*====================*/
+ mysql_pfs_key_t key, /*!< in: key registered with
+ performance schema */
+ rw_lock_t* lock, /*!< in: pointer to memory */
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+ const char* cmutex_name, /*!< in: mutex name */
+# endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline) /*!< in: file line where created */
+{
+ /* Initialize the rwlock for performance schema */
+ lock->pfs_psi = (PSI_server && PFS_IS_INSTRUMENTED(key))
+ ? PSI_server->init_rwlock(key, lock)
+ : NULL;
+
+ /* The actual function to initialize an rwlock */
+ rw_lock_create_func(lock,
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+ level,
+# endif /* UNIV_SYNC_DEBUG */
+ cmutex_name,
+# endif /* UNIV_DEBUG */
+ cfile_name,
+ cline);
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_lock_func()
+NOTE! Please use the corresponding macro rw_lock_x_lock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_lock_func(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ struct PSI_rwlock_locker* locker = NULL;
+
+ /* Record the entry of rw x lock request in performance schema */
+ if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
+ locker = PSI_server->get_thread_rwlock_locker(
+ lock->pfs_psi, PSI_RWLOCK_WRITELOCK);
+
+ if (locker) {
+ PSI_server->start_rwlock_wrwait(locker,
+ file_name, line);
+ }
+ }
+
+ rw_lock_x_lock_func(lock, pass, file_name, line);
+
+ if (locker) {
+ PSI_server->end_rwlock_wrwait(locker, 0);
+ }
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for
+rw_lock_x_lock_func_nowait()
+NOTE! Please use the corresponding macro rw_lock_x_lock_func(),
+not directly this function!
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_rw_lock_x_lock_func_nowait(
+/*===========================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ const char* file_name,/*!< in: file name where lock
+ requested */
+ ulint line) /*!< in: line where requested */
+{
+ struct PSI_rwlock_locker* locker = NULL;
+ ibool ret;
+
+ /* Record the entry of rw x lock request in performance schema */
+ if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
+ locker = PSI_server->get_thread_rwlock_locker(
+ lock->pfs_psi, PSI_RWLOCK_WRITELOCK);
+
+ if (locker) {
+ PSI_server->start_rwlock_wrwait(locker,
+ file_name, line);
+ }
+ }
+
+ ret = rw_lock_x_lock_func_nowait(lock, file_name, line);
+
+ if (locker) {
+ PSI_server->end_rwlock_wrwait(locker, 0);
+ }
+
+ return(ret);
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_free_func()
+NOTE! Please use the corresponding macro rw_lock_free(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_free_func(
+/*==================*/
+ rw_lock_t* lock) /*!< in: pointer to rw-lock */
+{
+ if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
+ PSI_server->destroy_rwlock(lock->pfs_psi);
+ lock->pfs_psi = NULL;
+ }
+
+ rw_lock_free_func(lock);
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_lock_func()
+NOTE! Please use the corresponding macro rw_lock_s_lock(), not
+directly this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_s_lock_func(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock will be passed to another
+ thread to unlock */
+ const char* file_name,/*!< in: file name where lock
+ requested */
+ ulint line) /*!< in: line where requested */
+{
+ struct PSI_rwlock_locker* locker = NULL;
+
+ /* Instrumented to inform we are aquiring a shared rwlock */
+ if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
+ locker = PSI_server->get_thread_rwlock_locker(
+ lock->pfs_psi, PSI_RWLOCK_READLOCK);
+ if (locker) {
+ PSI_server->start_rwlock_rdwait(locker,
+ file_name, line);
+ }
+ }
+
+ rw_lock_s_lock_func(lock, pass, file_name, line);
+
+ if (locker) {
+ PSI_server->end_rwlock_rdwait(locker, 0);
+ }
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_lock_func()
+NOTE! Please use the corresponding macro rw_lock_s_lock(), not
+directly this function!
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_rw_lock_s_lock_low(
+/*===================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock will be passed to another
+ thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+
+ struct PSI_rwlock_locker* locker = NULL;
+ ibool ret;
+
+ /* Instrumented to inform we are aquiring a shared rwlock */
+ if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
+ locker = PSI_server->get_thread_rwlock_locker(
+ lock->pfs_psi, PSI_RWLOCK_READLOCK);
+ if (locker) {
+ PSI_server->start_rwlock_rdwait(locker,
+ file_name, line);
+ }
+ }
+
+ ret = rw_lock_s_lock_low(lock, pass, file_name, line);
+
+ if (locker) {
+ PSI_server->end_rwlock_rdwait(locker, 0);
+ }
+
+ return(ret);
+}
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_unlock_func(
+/*======================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock may have been passed to another
+ thread to unlock */
+#endif
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ /* Inform performance schema we are unlocking the lock */
+ if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
+ struct PSI_thread* thread;
+ thread = PSI_server->get_thread();
+ if (thread) {
+ PSI_server->unlock_rwlock(thread, lock->pfs_psi);
+ }
+ }
+
+ rw_lock_x_unlock_func(
+#ifdef UNIV_SYNC_DEBUG
+ pass,
+#endif
+ lock);
+}
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_unlock_func()
+NOTE! Please use the corresponding macro pfs_rw_lock_s_unlock(), not
+directly this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_s_unlock_func(
+/*======================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock may have been passed to another
+ thread to unlock */
+#endif
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ /* Inform performance schema we are unlocking the lock */
+ if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
+ struct PSI_thread* thread;
+ thread = PSI_server->get_thread();
+ if (thread) {
+ PSI_server->unlock_rwlock(thread, lock->pfs_psi);
+ }
+ }
+
+ rw_lock_s_unlock_func(
+#ifdef UNIV_SYNC_DEBUG
+ pass,
+#endif
+ lock);
+
+}
+#endif /* UNIV_PFS_RWLOCK */
diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h
index df990823cc4..69c0382d5b9 100644
--- a/storage/innobase/include/sync0sync.h
+++ b/storage/innobase/include/sync0sync.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -52,6 +52,69 @@ typedef LONG lock_word_t; /*!< On Windows, InterlockedExchange operates
typedef byte lock_word_t;
#endif
+#if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK
+/* There are mutexes/rwlocks that we want to exclude from
+instrumentation even if their corresponding performance schema
+define is set. And this PFS_NOT_INSTRUMENTED is used
+as the key value to dentify those objects that would
+be excluded from instrumentation. */
+# define PFS_NOT_INSTRUMENTED ULINT32_UNDEFINED
+
+# define PFS_IS_INSTRUMENTED(key) ((key) != PFS_NOT_INSTRUMENTED)
+
+/* By default, buffer mutexes and rwlocks will be excluded from
+instrumentation due to their large number of instances. */
+# define PFS_SKIP_BUFFER_MUTEX_RWLOCK
+
+#endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+/* Key defines to register InnoDB mutexes with performance schema */
+extern mysql_pfs_key_t autoinc_mutex_key;
+extern mysql_pfs_key_t btr_search_enabled_mutex_key;
+extern mysql_pfs_key_t buffer_block_mutex_key;
+extern mysql_pfs_key_t buf_pool_mutex_key;
+extern mysql_pfs_key_t buf_pool_zip_mutex_key;
+extern mysql_pfs_key_t cache_last_read_mutex_key;
+extern mysql_pfs_key_t dict_foreign_err_mutex_key;
+extern mysql_pfs_key_t dict_sys_mutex_key;
+extern mysql_pfs_key_t file_format_max_mutex_key;
+extern mysql_pfs_key_t fil_system_mutex_key;
+extern mysql_pfs_key_t flush_list_mutex_key;
+extern mysql_pfs_key_t flush_order_mutex_key;
+extern mysql_pfs_key_t hash_table_mutex_key;
+extern mysql_pfs_key_t ibuf_bitmap_mutex_key;
+extern mysql_pfs_key_t ibuf_mutex_key;
+extern mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key;
+extern mysql_pfs_key_t ios_mutex_key;
+extern mysql_pfs_key_t log_sys_mutex_key;
+extern mysql_pfs_key_t kernel_mutex_key;
+# ifdef UNIV_MEM_DEBUG
+extern mysql_pfs_key_t mem_hash_mutex_key;
+# endif /* UNIV_MEM_DEBUG */
+extern mysql_pfs_key_t mem_pool_mutex_key;
+extern mysql_pfs_key_t mutex_list_mutex_key;
+extern mysql_pfs_key_t purge_sys_mutex_key;
+extern mysql_pfs_key_t recv_sys_mutex_key;
+extern mysql_pfs_key_t rseg_mutex_key;
+# ifdef UNIV_SYNC_DEBUG
+extern mysql_pfs_key_t rw_lock_debug_mutex_key;
+# endif /* UNIV_SYNC_DEBUG */
+extern mysql_pfs_key_t rw_lock_list_mutex_key;
+extern mysql_pfs_key_t rw_lock_mutex_key;
+extern mysql_pfs_key_t srv_dict_tmpfile_mutex_key;
+extern mysql_pfs_key_t srv_innodb_monitor_mutex_key;
+extern mysql_pfs_key_t srv_misc_tmpfile_mutex_key;
+extern mysql_pfs_key_t srv_monitor_file_mutex_key;
+extern mysql_pfs_key_t syn_arr_mutex_key;
+# ifdef UNIV_SYNC_DEBUG
+extern mysql_pfs_key_t sync_thread_mutex_key;
+# endif /* UNIV_SYNC_DEBUG */
+extern mysql_pfs_key_t trx_doublewrite_mutex_key;
+extern mysql_pfs_key_t thr_local_mutex_key;
+extern mysql_pfs_key_t trx_undo_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
/******************************************************************//**
Initializes the synchronization data structures. */
UNIV_INTERN
@@ -64,24 +127,82 @@ UNIV_INTERN
void
sync_close(void);
/*===========*/
+
+#undef mutex_free /* Fix for MacOS X */
+
+#ifdef UNIV_PFS_MUTEX
+/**********************************************************************
+Following mutex APIs would be performance schema instrumented
+if "UNIV_PFS_MUTEX" is defined:
+
+mutex_create
+mutex_enter
+mutex_exit
+mutex_enter_nowait
+mutex_free
+
+These mutex APIs will point to corresponding wrapper functions that contain
+the performance schema instrumentation if "UNIV_PFS_MUTEX" is defined.
+The instrumented wrapper functions have the prefix of "innodb_".
+
+NOTE! The following macro should be used in mutex operation, not the
+corresponding function. */
+
/******************************************************************//**
Creates, or rather, initializes a mutex object to a specified memory
location (which must be appropriately aligned). The mutex is initialized
in the reset state. Explicit freeing of the mutex with mutex_free is
necessary only if the memory block containing it is freed. */
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+# define mutex_create(K, M, level) \
+ pfs_mutex_create_func((K), (M), #M, (level), __FILE__, __LINE__)
+# else
+# define mutex_create(K, M, level) \
+ pfs_mutex_create_func((K), (M), #M, __FILE__, __LINE__)
+# endif/* UNIV_SYNC_DEBUG */
+# else
+# define mutex_create(K, M, level) \
+ pfs_mutex_create_func((K), (M), __FILE__, __LINE__)
+# endif /* UNIV_DEBUG */
-#ifdef UNIV_DEBUG
-# ifdef UNIV_SYNC_DEBUG
-# define mutex_create(M, level) \
+# define mutex_enter(M) \
+ pfs_mutex_enter_func((M), __FILE__, __LINE__)
+
+# define mutex_enter_nowait(M) \
+ pfs_mutex_enter_nowait_func((M), __FILE__, __LINE__)
+
+# define mutex_exit(M) pfs_mutex_exit_func(M)
+
+# define mutex_free(M) pfs_mutex_free_func(M)
+
+#else /* UNIV_PFS_MUTEX */
+
+/* If "UNIV_PFS_MUTEX" is not defined, the mutex APIs point to
+original non-instrumented functions */
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+# define mutex_create(K, M, level) \
mutex_create_func((M), #M, (level), __FILE__, __LINE__)
-# else
-# define mutex_create(M, level) \
+# else /* UNIV_SYNC_DEBUG */
+# define mutex_create(K, M, level) \
mutex_create_func((M), #M, __FILE__, __LINE__)
-# endif
-#else
-# define mutex_create(M, level) \
+# endif /* UNIV_SYNC_DEBUG */
+# else /* UNIV_DEBUG */
+# define mutex_create(K, M, level) \
mutex_create_func((M), __FILE__, __LINE__)
-#endif
+# endif /* UNIV_DEBUG */
+
+# define mutex_enter(M) mutex_enter_func((M), __FILE__, __LINE__)
+
+# define mutex_enter_nowait(M) \
+ mutex_enter_nowait_func((M), __FILE__, __LINE__)
+
+# define mutex_exit(M) mutex_exit_func(M)
+
+# define mutex_free(M) mutex_free_func(M)
+
+#endif /* UNIV_PFS_MUTEX */
/******************************************************************//**
Creates, or rather, initializes a mutex object in a specified memory
@@ -102,26 +223,20 @@ mutex_create_func(
const char* cfile_name, /*!< in: file name where created */
ulint cline); /*!< in: file line where created */
-#undef mutex_free /* Fix for MacOS X */
-
/******************************************************************//**
+NOTE! Use the corresponding macro mutex_free(), not directly this function!
Calling this function is obligatory only if the memory buffer containing
the mutex is freed. Removes a mutex object from the mutex list. The mutex
is checked to be in the reset state. */
UNIV_INTERN
void
-mutex_free(
-/*=======*/
+mutex_free_func(
+/*============*/
mutex_t* mutex); /*!< in: mutex */
/**************************************************************//**
NOTE! The following macro should be used in mutex locking, not the
corresponding function. */
-#define mutex_enter(M) mutex_enter_func((M), __FILE__, __LINE__)
-/**************************************************************//**
-NOTE! The following macro should be used in mutex locking, not the
-corresponding function. */
-
/* NOTE! currently same as mutex_enter! */
#define mutex_enter_fast(M) mutex_enter_func((M), __FILE__, __LINE__)
@@ -137,12 +252,6 @@ mutex_enter_func(
mutex_t* mutex, /*!< in: pointer to mutex */
const char* file_name, /*!< in: file name where locked */
ulint line); /*!< in: line where locked */
-/**************************************************************//**
-NOTE! The following macro should be used in mutex locking, not the
-corresponding function. */
-
-#define mutex_enter_nowait(M) \
- mutex_enter_nowait_func((M), __FILE__, __LINE__)
/********************************************************************//**
NOTE! Use the corresponding macro in the header file, not this function
directly. Tries to lock the mutex for the current thread. If the lock is not
@@ -157,12 +266,86 @@ mutex_enter_nowait_func(
requested */
ulint line); /*!< in: line where requested */
/******************************************************************//**
+NOTE! Use the corresponding macro mutex_exit(), not directly this function!
Unlocks a mutex owned by the current thread. */
UNIV_INLINE
void
-mutex_exit(
-/*=======*/
+mutex_exit_func(
+/*============*/
mutex_t* mutex); /*!< in: pointer to mutex */
+
+
+#ifdef UNIV_PFS_MUTEX
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_create(), not directly
+this function!
+A wrapper function for mutex_create_func(), registers the mutex
+with peformance schema if "UNIV_PFS_MUTEX" is defined when
+creating the mutex */
+UNIV_INLINE
+void
+pfs_mutex_create_func(
+/*==================*/
+ PSI_mutex_key key, /*!< in: Performance Schema key */
+ mutex_t* mutex, /*!< in: pointer to memory */
+# ifdef UNIV_DEBUG
+ const char* cmutex_name, /*!< in: mutex name */
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+# endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline); /*!< in: file line where created */
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_func(). */
+UNIV_INLINE
+void
+pfs_mutex_enter_func(
+/*=================*/
+ mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where locked */
+ ulint line); /*!< in: line where locked */
+/********************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter_nowait(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_nowait_func.
+@return 0 if succeed, 1 if not */
+UNIV_INLINE
+ulint
+pfs_mutex_enter_nowait_func(
+/*========================*/
+ mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where mutex
+ requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_exit(), not directly
+this function!
+A wrap function of mutex_exit_func() with peformance schema instrumentation.
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+pfs_mutex_exit_func(
+/*================*/
+ mutex_t* mutex); /*!< in: pointer to mutex */
+
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_free(), not directly
+this function!
+Wrapper function for mutex_free_func(). Also destroys the performance
+schema probes when freeing the mutex */
+UNIV_INLINE
+void
+pfs_mutex_free_func(
+/*================*/
+ mutex_t* mutex); /*!< in: mutex */
+
+#endif /* UNIV_PFS_MUTEX */
+
#ifdef UNIV_SYNC_DEBUG
/******************************************************************//**
Returns TRUE if no mutex or rw-lock is currently locked.
@@ -206,7 +389,8 @@ UNIV_INTERN
ibool
mutex_own(
/*======*/
- const mutex_t* mutex); /*!< in: mutex */
+ const mutex_t* mutex) /*!< in: mutex */
+ __attribute__((warn_unused_result));
#endif /* UNIV_DEBUG */
#ifdef UNIV_SYNC_DEBUG
/******************************************************************//**
@@ -238,16 +422,27 @@ ibool
sync_thread_levels_empty(void);
/*==========================*/
/******************************************************************//**
-Checks that the level array for the current thread is empty.
-@return TRUE if empty except the exceptions specified below */
+Checks if the level array for the current thread contains a
+mutex or rw-latch at the specified level.
+@return a matching latch, or NULL if not found */
UNIV_INTERN
-ibool
-sync_thread_levels_empty_gen(
-/*=========================*/
+void*
+sync_thread_levels_contains(
+/*========================*/
+ ulint level); /*!< in: latching order level
+ (SYNC_DICT, ...)*/
+/******************************************************************//**
+Checks if the level array for the current thread is empty.
+@return a latch, or NULL if empty except the exceptions specified below */
+UNIV_INTERN
+void*
+sync_thread_levels_nonempty_gen(
+/*============================*/
ibool dict_mutex_allowed); /*!< in: TRUE if dictionary mutex is
allowed to be owned by the thread,
also purge_is_running mutex is
allowed */
+#define sync_thread_levels_empty_gen(d) (!sync_thread_levels_nonempty_gen(d))
/******************************************************************//**
Gets the debug information for a reserved mutex. */
UNIV_INTERN
@@ -475,8 +670,10 @@ or row lock! */
SYNC_SEARCH_SYS, as memory allocation
can call routines there! Otherwise
the level is SYNC_MEM_HASH. */
-#define SYNC_BUF_POOL 150
-#define SYNC_BUF_BLOCK 149
+#define SYNC_BUF_POOL 150 /* Buffer pool mutex */
+#define SYNC_BUF_FLUSH_ORDER 147
+#define SYNC_BUF_BLOCK 146 /* Block mutex */
+#define SYNC_BUF_FLUSH_LIST 145 /* Buffer flush list mutex */
#define SYNC_DOUBLEWRITE 140
#define SYNC_ANY_LATCH 135
#define SYNC_THR_LOCAL 133
@@ -538,6 +735,10 @@ struct mutex_struct {
const char* cmutex_name; /*!< mutex name */
ulint mutex_type; /*!< 0=usual mutex, 1=rw_lock mutex */
#endif /* UNIV_DEBUG */
+#ifdef UNIV_PFS_MUTEX
+ struct PSI_mutex* pfs_psi; /*!< The performance schema
+ instrumentation hook */
+#endif
};
/** The global array of wait cells for implementation of the databases own
diff --git a/storage/innobase/include/sync0sync.ic b/storage/innobase/include/sync0sync.ic
index b05020b5660..fdd70ad052f 100644
--- a/storage/innobase/include/sync0sync.ic
+++ b/storage/innobase/include/sync0sync.ic
@@ -152,11 +152,12 @@ mutex_get_waiters(
}
/******************************************************************//**
+NOTE! Use the corresponding macro mutex_exit(), not directly this function!
Unlocks a mutex owned by the current thread. */
UNIV_INLINE
void
-mutex_exit(
-/*=======*/
+mutex_exit_func(
+/*============*/
mutex_t* mutex) /*!< in: pointer to mutex */
{
ut_ad(mutex_own(mutex));
@@ -220,3 +221,148 @@ mutex_enter_func(
mutex_spin_wait(mutex, file_name, line);
}
+
+#ifdef UNIV_PFS_MUTEX
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_func(). */
+UNIV_INLINE
+void
+pfs_mutex_enter_func(
+/*=================*/
+ mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where locked */
+ ulint line) /*!< in: line where locked */
+{
+ struct PSI_mutex_locker* locker = NULL;
+ int result = 0;
+
+ if (UNIV_LIKELY(PSI_server && mutex->pfs_psi)) {
+ locker = PSI_server->get_thread_mutex_locker(
+ mutex->pfs_psi, PSI_MUTEX_LOCK);
+ if (locker) {
+ PSI_server->start_mutex_wait(locker, file_name, line);
+ }
+ }
+
+ mutex_enter_func(mutex, file_name, line);
+
+ if (locker) {
+ PSI_server->end_mutex_wait(locker, result);
+ }
+}
+/********************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter_nowait(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_nowait_func.
+@return 0 if succeed, 1 if not */
+UNIV_INLINE
+ulint
+pfs_mutex_enter_nowait_func(
+/*========================*/
+ mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where mutex
+ requested */
+ ulint line) /*!< in: line where requested */
+{
+ ulint ret;
+ struct PSI_mutex_locker* locker = NULL;
+ int result = 0;
+
+ if (UNIV_LIKELY(PSI_server && mutex->pfs_psi)) {
+ locker = PSI_server->get_thread_mutex_locker(
+ mutex->pfs_psi, PSI_MUTEX_LOCK);
+ if (locker) {
+ PSI_server->start_mutex_wait(locker, file_name, line);
+ }
+ }
+
+ ret = mutex_enter_nowait_func(mutex, file_name, line);
+
+ if (locker) {
+ PSI_server->end_mutex_wait(locker, result);
+ }
+
+ return(ret);
+}
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_exit(), not directly
+this function!
+A wrap function of mutex_exit_func() with performance schema instrumentation.
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+pfs_mutex_exit_func(
+/*================*/
+ mutex_t* mutex) /*!< in: pointer to mutex */
+{
+ if (UNIV_LIKELY(PSI_server && mutex->pfs_psi)) {
+ struct PSI_thread* thread;
+ thread = PSI_server->get_thread();
+
+ if (thread) {
+ PSI_server->unlock_mutex(thread, mutex->pfs_psi);
+ }
+ }
+
+ mutex_exit_func(mutex);
+}
+
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_create(), not directly
+this function!
+A wrapper function for mutex_create_func(), registers the mutex
+with performance schema if "UNIV_PFS_MUTEX" is defined when
+creating the mutex */
+UNIV_INLINE
+void
+pfs_mutex_create_func(
+/*==================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema key */
+ mutex_t* mutex, /*!< in: pointer to memory */
+# ifdef UNIV_DEBUG
+ const char* cmutex_name, /*!< in: mutex name */
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+# endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline) /*!< in: file line where created */
+{
+ mutex->pfs_psi = (PSI_server && PFS_IS_INSTRUMENTED(key))
+ ? PSI_server->init_mutex(key, mutex)
+ : NULL;
+
+ mutex_create_func(mutex,
+# ifdef UNIV_DEBUG
+ cmutex_name,
+# ifdef UNIV_SYNC_DEBUG
+ level,
+# endif /* UNIV_SYNC_DEBUG */
+# endif /* UNIV_DEBUG */
+ cfile_name,
+ cline);
+}
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_free(), not directly
+this function!
+Wrapper function for mutex_free_func(). Also destroys the performance
+schema probes when freeing the mutex */
+UNIV_INLINE
+void
+pfs_mutex_free_func(
+/*===================*/
+ mutex_t* mutex) /*!< in: mutex */
+{
+ if (UNIV_LIKELY(PSI_server && mutex->pfs_psi)) {
+ PSI_server->destroy_mutex(mutex->pfs_psi);
+ mutex->pfs_psi= NULL;
+ }
+
+ mutex_free_func(mutex);
+}
+
+#endif /* UNIV_PFS_MUTEX */
diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h
index 908760580f6..d2730a68a78 100644
--- a/storage/innobase/include/trx0purge.h
+++ b/storage/innobase/include/trx0purge.h
@@ -112,8 +112,10 @@ This function runs a purge batch.
@return number of undo log pages handled in the batch */
UNIV_INTERN
ulint
-trx_purge(void);
-/*===========*/
+trx_purge(
+/*======*/
+ ulint limit); /*!< in: the maximum number of records to
+ purge in one batch */
/******************************************************************//**
Prints information of the purge system to stderr. */
UNIV_INTERN
diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h
index ba1fc88b6c4..78a7a8c4bb0 100644
--- a/storage/innobase/include/trx0rseg.h
+++ b/storage/innobase/include/trx0rseg.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -103,7 +103,7 @@ trx_rseg_header_create(
ulint zip_size, /*!< in: compressed page size in bytes
or 0 for uncompressed pages */
ulint max_size, /*!< in: max size in pages */
- ulint* slot_no, /*!< out: rseg id == slot number in trx sys */
+ ulint rseg_slot_no, /*!< in: rseg id == slot number in trx sys */
mtr_t* mtr); /*!< in: mtr */
/*********************************************************************//**
Creates the memory copies for rollback segments and initializes the
@@ -114,17 +114,6 @@ trx_rseg_list_and_array_init(
/*=========================*/
trx_sysf_t* sys_header, /*!< in: trx system header */
mtr_t* mtr); /*!< in: mtr */
-/****************************************************************//**
-Creates a new rollback segment to the database.
-@return the created segment object, NULL if fail */
-UNIV_INTERN
-trx_rseg_t*
-trx_rseg_create(
-/*============*/
- ulint space, /*!< in: space id */
- ulint max_size, /*!< in: max size in pages */
- ulint* id, /*!< out: rseg id */
- mtr_t* mtr); /*!< in: mtr */
/***************************************************************************
Free's an instance of the rollback segment in memory. */
UNIV_INTERN
@@ -133,6 +122,12 @@ trx_rseg_mem_free(
/*==============*/
trx_rseg_t* rseg); /* in, own: instance to free */
+/*********************************************************************
+Creates a rollback segment. */
+UNIV_INTERN
+trx_rseg_t*
+trx_rseg_create(void);
+/*==================*/
/* Number of undo log slots in a rollback segment file copy */
#define TRX_RSEG_N_SLOTS (UNIV_PAGE_SIZE / 16)
diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
index a53296a06d9..fc92b4317d5 100644
--- a/storage/innobase/include/trx0sys.h
+++ b/storage/innobase/include/trx0sys.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -333,12 +333,14 @@ UNIV_INTERN
void
trx_sys_file_format_tag_init(void);
/*==============================*/
+#ifndef UNIV_HOTBACKUP
/*****************************************************************//**
Shutdown/Close the transaction system. */
UNIV_INTERN
void
trx_sys_close(void);
/*===============*/
+#endif /* !UNIV_HOTBACKUP */
/*****************************************************************//**
Get the name representation of the file format from its id.
@return pointer to the name */
@@ -429,6 +431,14 @@ trx_sys_file_format_id_to_name(
const ulint id); /*!< in: id of the file format */
#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************
+Creates the rollback segments */
+UNIV_INTERN
+void
+trx_sys_create_rsegs(
+/*=================*/
+ ulint n_rsegs); /*!< number of rollback segments to create */
+
/* The automatically created system rollback segment has this id */
#define TRX_SYS_SYSTEM_RSEG_ID 0
@@ -463,11 +473,16 @@ trx_sys_file_format_id_to_name(
slots */
/*------------------------------------------------------------- @} */
-/** Maximum number of rollback segments: the number of segment
-specification slots in the transaction system array; rollback segment
-id must fit in one byte, therefore 256; each slot is currently 8 bytes
-in size */
-#define TRX_SYS_N_RSEGS 256
+/* Max number of rollback segments: the number of segment specification slots
+in the transaction system array; rollback segment id must fit in one (signed)
+byte, therefore 128; each slot is currently 8 bytes in size. If you want
+to raise the level to 256 then you will need to fix some assertions that
+impose the 7 bit restriction. e.g., mach_write_to_3() */
+#define TRX_SYS_N_RSEGS 128
+/* Originally, InnoDB defined TRX_SYS_N_RSEGS as 256 but created only one
+rollback segment. It initialized some arrays with this number of entries.
+We must remember this limit in order to keep file compatibility. */
+#define TRX_SYS_OLD_N_RSEGS 256
/** Maximum length of MySQL binlog file name, in bytes.
@see trx_sys_mysql_master_log_name
@@ -495,7 +510,6 @@ this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */
within that file */
#define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */
-#ifndef UNIV_HOTBACKUP
/** Doublewrite buffer */
/* @{ */
/** The offset of the doublewrite buffer header on the trx system header page */
@@ -547,6 +561,7 @@ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_NO. */
#define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE
/* @} */
+#ifndef UNIV_HOTBACKUP
/** File format tag */
/* @{ */
/** The offset of the file format tag on the trx system header page
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
index 5f2c1246f37..480f265a138 100644
--- a/storage/innobase/include/trx0trx.h
+++ b/storage/innobase/include/trx0trx.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -349,7 +349,7 @@ trx_print(
use the default max length */
/** Type of data dictionary operation */
-enum trx_dict_op {
+typedef enum trx_dict_op {
/** The transaction is not modifying the data dictionary. */
TRX_DICT_OP_NONE = 0,
/** The transaction is creating a table or an index, or
@@ -361,7 +361,7 @@ enum trx_dict_op {
existing table. In crash recovery, the data dictionary
must be locked, but the table must not be dropped. */
TRX_DICT_OP_INDEX = 2
-};
+} trx_dict_op_t;
/**********************************************************************//**
Determine if a transaction is a dictionary operation.
@@ -463,69 +463,79 @@ rolling back after a database recovery */
struct trx_struct{
ulint magic_n;
- /* All the next fields are protected by the kernel mutex, except the
- undo logs which are protected by undo_mutex */
+
+ /* These fields are not protected by any mutex. */
const char* op_info; /*!< English text describing the
current operation, or an empty
string */
- unsigned is_purge:1; /*!< 0=user transaction, 1=purge */
- unsigned is_recovered:1; /*!< 0=normal transaction,
- 1=recovered, must be rolled back */
- unsigned conc_state:2; /*!< state of the trx from the point
+ ulint conc_state; /*!< state of the trx from the point
of view of concurrency control:
TRX_ACTIVE, TRX_COMMITTED_IN_MEMORY,
... */
- unsigned que_state:2; /*!< valid when conc_state == TRX_ACTIVE:
- TRX_QUE_RUNNING, TRX_QUE_LOCK_WAIT,
- ... */
- unsigned isolation_level:2;/* TRX_ISO_REPEATABLE_READ, ... */
- unsigned check_foreigns:1;/* normally TRUE, but if the user
+ ulint isolation_level;/* TRX_ISO_REPEATABLE_READ, ... */
+ ulint check_foreigns; /* normally TRUE, but if the user
wants to suppress foreign key checks,
(in table imports, for example) we
set this FALSE */
- unsigned check_unique_secondary:1;
+ ulint check_unique_secondary;
/* normally TRUE, but if the user
wants to speed up inserts by
suppressing unique key checks
for secondary indexes when we decide
if we can use the insert buffer for
them, we set this FALSE */
- unsigned support_xa:1; /*!< normally we do the XA two-phase
+ ulint support_xa; /*!< normally we do the XA two-phase
commit steps, but by setting this to
FALSE, one can save CPU time and about
150 bytes in the undo log size as then
we skip XA steps */
- unsigned flush_log_later:1;/* In 2PC, we hold the
+ ulint flush_log_later;/* In 2PC, we hold the
prepare_commit mutex across
both phases. In that case, we
defer flush of the logs to disk
until after we release the
mutex. */
- unsigned must_flush_log_later:1;/* this flag is set to TRUE in
+ ulint must_flush_log_later;/* this flag is set to TRUE in
trx_commit_off_kernel() if
flush_log_later was TRUE, and there
were modifications by the transaction;
in that case we must flush the log
in trx_commit_complete_for_mysql() */
- unsigned dict_operation:2;/**< @see enum trx_dict_op */
- unsigned duplicates:2; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
- unsigned active_trans:2; /*!< 1 - if a transaction in MySQL
+ ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
+ ulint active_trans; /*!< 1 - if a transaction in MySQL
is active. 2 - if prepare_commit_mutex
was taken */
- unsigned has_search_latch:1;
+ ulint has_search_latch;
/* TRUE if this trx has latched the
search system latch in S-mode */
- unsigned declared_to_be_inside_innodb:1;
+ ulint deadlock_mark; /*!< a mark field used in deadlock
+ checking algorithm. */
+ trx_dict_op_t dict_operation; /**< @see enum trx_dict_op */
+
+ /* Fields protected by the srv_conc_mutex. */
+ ulint declared_to_be_inside_innodb;
/* this is TRUE if we have declared
this transaction in
srv_conc_enter_innodb to be inside the
InnoDB engine */
- unsigned handling_signals:1;/* this is TRUE as long as the trx
- is handling signals */
- unsigned dict_operation_lock_mode:2;
- /* 0, RW_S_LATCH, or RW_X_LATCH:
+
+ /* Fields protected by dict_operation_lock. The very latch
+ it is used to track. */
+ ulint dict_operation_lock_mode;
+ /*!< 0, RW_S_LATCH, or RW_X_LATCH:
the latch mode trx currently holds
on dict_operation_lock */
+
+ /* All the next fields are protected by the kernel mutex, except the
+ undo logs which are protected by undo_mutex */
+ ulint is_purge; /*!< 0=user transaction, 1=purge */
+ ulint is_recovered; /*!< 0=normal transaction,
+ 1=recovered, must be rolled back */
+ ulint que_state; /*!< valid when conc_state
+ == TRX_ACTIVE: TRX_QUE_RUNNING,
+ TRX_QUE_LOCK_WAIT, ... */
+ ulint handling_signals;/* this is TRUE as long as the trx
+ is handling signals */
time_t start_time; /*!< time the trx object was created
or the state last time became
TRX_ACTIVE */
@@ -640,11 +650,6 @@ struct trx_struct{
wait_thrs; /*!< query threads belonging to this
trx that are in the QUE_THR_LOCK_WAIT
state */
- ulint deadlock_mark; /*!< a mark field used in deadlock
- checking algorithm. This must be
- in its own machine word, because
- it can be changed by other
- threads while holding kernel_mutex. */
/*------------------------------*/
mem_heap_t* lock_heap; /*!< memory heap for the locks of the
transaction */
diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h
index 24cf57d53d5..40a7256cbfd 100644
--- a/storage/innobase/include/trx0types.h
+++ b/storage/innobase/include/trx0types.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -70,6 +70,13 @@ typedef struct trx_named_savept_struct trx_named_savept_t;
enum trx_rb_ctx {
RB_NONE = 0, /*!< no rollback */
RB_NORMAL, /*!< normal rollback */
+ RB_RECOVERY_PURGE_REC,
+ /*!< rolling back an incomplete transaction,
+ in crash recovery, rolling back an
+ INSERT that was performed by updating a
+ delete-marked record; if the delete-marked record
+ no longer exists in an active read view, it will
+ be purged */
RB_RECOVERY /*!< rolling back an incomplete transaction,
in crash recovery */
};
diff --git a/storage/innobase/include/trx0undo.ic b/storage/innobase/include/trx0undo.ic
index 2d289b34ef1..6502ee826e5 100644
--- a/storage/innobase/include/trx0undo.ic
+++ b/storage/innobase/include/trx0undo.ic
@@ -42,7 +42,7 @@ trx_undo_build_roll_ptr(
#if DATA_ROLL_PTR_LEN != 7
# error "DATA_ROLL_PTR_LEN != 7"
#endif
- ut_ad(rseg_id < 128);
+ ut_ad(rseg_id < TRX_SYS_N_RSEGS);
return(ut_dulint_create(is_insert * 128 * 256 * 256
+ rseg_id * 256 * 256
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i
index 2081e136590..ea0ad4e790c 100644
--- a/storage/innobase/include/univ.i
+++ b/storage/innobase/include/univ.i
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Copyright (c) 2009, Sun Microsystems, Inc.
@@ -45,8 +45,8 @@ Created 1/20/1994 Heikki Tuuri
#endif /* UNIV_HOTBACKUP */
#define INNODB_VERSION_MAJOR 1
-#define INNODB_VERSION_MINOR 0
-#define INNODB_VERSION_BUGFIX 6
+#define INNODB_VERSION_MINOR 1
+#define INNODB_VERSION_BUGFIX 0
/* The following is the InnoDB version as shown in
SELECT plugin_version FROM information_schema.plugins;
@@ -144,6 +144,23 @@ Sun Studio */
#endif /* #if (defined(WIN32) || ... */
+/* Following defines are to enable performance schema
+instrumentation in each of four InnoDB modules if
+HAVE_PSI_INTERFACE is defined. */
+#ifdef HAVE_PSI_INTERFACE
+# define UNIV_PFS_MUTEX
+# define UNIV_PFS_RWLOCK
+/* For I/O instrumentation, performance schema rely
+on a native descriptor to identify the file, this
+descriptor could conflict with our OS level descriptor.
+Disable IO instrumentation on Windows until this is
+resolved */
+# ifndef __WIN__
+# define UNIV_PFS_IO
+# endif
+# define UNIV_PFS_THREAD
+#endif /* HAVE_PSI_INTERFACE */
+
/* DEBUG VERSION CONTROL
===================== */
@@ -208,6 +225,9 @@ operations (very slow); also UNIV_DEBUG must be defined */
for compressed pages */
#define UNIV_ZIP_COPY /* call page_zip_copy_recs()
more often */
+#define UNIV_AIO_DEBUG /* prints info about
+ submitted and reaped AIO
+ requests to the log. */
#endif
#define UNIV_BTR_DEBUG /* check B-tree links */
@@ -229,11 +249,6 @@ by one. */
/* the above option prevents forcing of log to disk
at a buffer page write: it should be tested with this
option off; also some ibuf tests are suppressed */
-/*
-#define UNIV_BASIC_LOG_DEBUG
-*/
- /* the above option enables basic recovery debugging:
- new allocated file pages are reset */
/* Linkage specifier for non-static InnoDB symbols (variables and functions)
that are only referenced from within InnoDB, not from MySQL */
diff --git a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h
index 261d33963dc..bb295ea1b22 100644
--- a/storage/innobase/include/ut0lst.h
+++ b/storage/innobase/include/ut0lst.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -110,7 +110,7 @@ Adds the node as the last element in a two-way linked list.
*/
#define UT_LIST_ADD_LAST(NAME, BASE, N)\
{\
- ut_ad(N);\
+ ut_ad(N != NULL);\
((BASE).count)++;\
((N)->NAME).prev = (BASE).end;\
((N)->NAME).next = NULL;\
diff --git a/storage/innobase/include/ut0rbt.h b/storage/innobase/include/ut0rbt.h
new file mode 100644
index 00000000000..a35807be442
--- /dev/null
+++ b/storage/innobase/include/ut0rbt.h
@@ -0,0 +1,293 @@
+/******************************************************
+Red-Black tree implementation.
+(c) 2007 Oracle/Innobase Oy
+
+Created 2007-03-20 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_UT0RBT_H
+#define INNOBASE_UT0RBT_H
+
+#if !defined(IB_RBT_TESTING)
+#include "univ.i"
+#include "ut0mem.h"
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#define ut_malloc malloc
+#define ut_free free
+#define ulint unsigned long
+#define ut_a(c) assert(c)
+#define ut_error assert(0)
+#define ibool unsigned int
+#define TRUE 1
+#define FALSE 0
+#endif
+
+/* Red black tree typedefs */
+typedef struct ib_rbt_struct ib_rbt_t;
+typedef struct ib_rbt_node_struct ib_rbt_node_t;
+// FIXME: Iterator is a better name than _bound_
+typedef struct ib_rbt_bound_struct ib_rbt_bound_t;
+typedef void (*ib_rbt_print_node)(const ib_rbt_node_t* node);
+typedef int (*ib_rbt_compare)(const void* p1, const void* p2);
+
+/* Red black tree color types */
+enum ib_rbt_color_enum {
+ IB_RBT_RED,
+ IB_RBT_BLACK
+};
+
+typedef enum ib_rbt_color_enum ib_rbt_color_t;
+
+/* Red black tree node */
+struct ib_rbt_node_struct {
+ ib_rbt_color_t color; /* color of this node */
+
+ ib_rbt_node_t* left; /* points left child */
+ ib_rbt_node_t* right; /* points right child */
+ ib_rbt_node_t* parent; /* points parent node */
+
+ char value[1]; /* Data value */
+};
+
+/* Red black tree instance.*/
+struct ib_rbt_struct {
+ ib_rbt_node_t* nil; /* Black colored node that is
+ used as a sentinel. This is
+ pre-allocated too.*/
+
+ ib_rbt_node_t* root; /* Root of the tree, this is
+ pre-allocated and the first
+ data node is the left child.*/
+
+ ulint n_nodes; /* Total number of data nodes */
+
+ ib_rbt_compare compare; /* Fn. to use for comparison */
+ ulint sizeof_value; /* Sizeof the item in bytes */
+};
+
+/* The result of searching for a key in the tree, this is useful for
+a speedy lookup and insert if key doesn't exist.*/
+struct ib_rbt_bound_struct {
+ const ib_rbt_node_t*
+ last; /* Last node visited */
+
+ int result; /* Result of comparing with
+ the last non-nil node that
+ was visited */
+};
+
+/* Size in elements (t is an rb tree instance) */
+#define rbt_size(t) (t->n_nodes)
+
+/* Check whether the rb tree is empty (t is an rb tree instance) */
+#define rbt_empty(t) (rbt_size(t) == 0)
+
+/* Get data value (t is the data type, n is an rb tree node instance) */
+#define rbt_value(t, n) ((t*) &n->value[0])
+
+/* Compare a key with the node value (t is tree, k is key, n is node)*/
+#define rbt_compare(t, k, n) (t->compare(k, n->value))
+
+/************************************************************************
+Free an instance of a red black tree */
+UNIV_INTERN
+void
+rbt_free(
+/*=====*/
+ ib_rbt_t* tree); /*!< in: rb tree to free */
+/************************************************************************
+Create an instance of a red black tree
+@return rb tree instance */
+UNIV_INTERN
+ib_rbt_t*
+rbt_create(
+/*=======*/
+ size_t sizeof_value, /*!< in: size in bytes */
+ ib_rbt_compare compare); /*!< in: comparator */
+/************************************************************************
+Delete a node from the red black tree, identified by key */
+UNIV_INTERN
+ibool
+rbt_delete(
+/*=======*/
+ /* in: TRUE on success */
+ ib_rbt_t* tree, /* in: rb tree */
+ const void* key); /* in: key to delete */
+/************************************************************************
+Remove a node from the red black tree, NOTE: This function will not delete
+the node instance, THAT IS THE CALLERS RESPONSIBILITY.
+@return the deleted node with the const. */
+UNIV_INTERN
+ib_rbt_node_t*
+rbt_remove_node(
+/*============*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t*
+ node); /*!< in: node to delete, this
+ is a fudge and declared const
+ because the caller has access
+ only to const nodes.*/
+/************************************************************************
+Return a node from the red black tree, identified by
+key, NULL if not found
+@return node if found else return NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lookup(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree to search */
+ const void* key); /*!< in: key to lookup */
+/************************************************************************
+Add data to the red black tree, identified by key (no dups yet!)
+@return inserted node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_insert(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key, /*!< in: key for ordering */
+ const void* value); /*!< in: data that will be
+ copied to the node.*/
+/************************************************************************
+Add a new node to the tree, useful for data that is pre-sorted.
+@return appended node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_add_node(
+/*=========*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: parent */
+ const void* value); /*!< in: this value is copied
+ to the node */
+/************************************************************************
+Return the left most data node in the tree
+@return left most node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_first(
+/*======*/
+ const ib_rbt_t* tree); /*!< in: rb tree */
+/************************************************************************
+Return the right most data node in the tree
+@return right most node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_last(
+/*=====*/
+ const ib_rbt_t* tree); /*!< in: rb tree */
+/************************************************************************
+Return the next node from current.
+@return successor node to current that is passed in. */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_next(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* /* in: current node */
+ current);
+/************************************************************************
+Return the prev node from current.
+@return precedessor node to current that is passed in */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_prev(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* /* in: current node */
+ current);
+/************************************************************************
+Find the node that has the lowest key that is >= key.
+@return node that satisfies the lower bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lower_bound(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key); /*!< in: key to search */
+/************************************************************************
+Find the node that has the greatest key that is <= key.
+@return node that satisifies the upper bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_upper_bound(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key); /*!< in: key to search */
+/************************************************************************
+Search for the key, a node will be retuned in parent.last, whether it
+was found or not. If not found then parent.last will contain the
+parent node for the possibly new key otherwise the matching node.
+@return result of last comparison */
+UNIV_INTERN
+int
+rbt_search(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key); /*!< in: key to search */
+/************************************************************************
+Search for the key, a node will be retuned in parent.last, whether it
+was found or not. If not found then parent.last will contain the
+parent node for the possibly new key otherwise the matching node.
+@return result of last comparison */
+UNIV_INTERN
+int
+rbt_search_cmp(
+/*===========*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key, /*!< in: key to search */
+ ib_rbt_compare compare); /*!< in: comparator */
+/************************************************************************
+Clear the tree, deletes (and free's) all the nodes. */
+UNIV_INTERN
+void
+rbt_clear(
+/*======*/
+ ib_rbt_t* tree); /*!< in: rb tree */
+/************************************************************************
+Merge the node from dst into src. Return the number of nodes merged.
+@return no. of recs merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq(
+/*===========*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ const ib_rbt_t* src); /*!< in: src rb tree */
+/************************************************************************
+Merge the node from dst into src. Return the number of nodes merged.
+Delete the nodes from src after copying node to dst. As a side effect
+the duplicates will be left untouched in the src, since we don't support
+duplicates (yet). NOTE: src and dst must be similar, the function doesn't
+check for this condition (yet).
+@return no. of recs merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq_destructive(
+/*=======================*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ ib_rbt_t* src); /*!< in: src rb tree */
+/************************************************************************
+Verify the integrity of the RB tree. For debugging. 0 failure else height
+of tree (in count of black nodes).
+@return TRUE if OK FALSE if tree invalid. */
+UNIV_INTERN
+ibool
+rbt_validate(
+/*=========*/
+ const ib_rbt_t* tree); /*!< in: tree to validate */
+/************************************************************************
+Iterate over the tree in depth first order. */
+UNIV_INTERN
+void
+rbt_print(
+/*======*/
+ const ib_rbt_t* tree, /*!< in: tree to traverse */
+ ib_rbt_print_node print); /*!< in: print function */
+
+#endif /* INNOBASE_UT0RBT_H */
diff --git a/storage/innobase/include/ut0rnd.ic b/storage/innobase/include/ut0rnd.ic
index 763469142ec..c3dbd86923c 100644
--- a/storage/innobase/include/ut0rnd.ic
+++ b/storage/innobase/include/ut0rnd.ic
@@ -152,6 +152,7 @@ ut_hash_ulint(
ulint key, /*!< in: value to be hashed */
ulint table_size) /*!< in: hash table size */
{
+ ut_ad(table_size);
key = key ^ UT_HASH_RANDOM_MASK2;
return(key % table_size);
diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h
index 197b8401428..dd59b3eba46 100644
--- a/storage/innobase/include/ut0ut.h
+++ b/storage/innobase/include/ut0ut.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2009, Sun Microsystems, Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -35,6 +35,8 @@ Created 1/20/1994 Heikki Tuuri
#include "univ.i"
+#include "db0err.h"
+
#ifndef UNIV_HOTBACKUP
# include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
#endif /* UNIV_HOTBACKUP */
@@ -395,6 +397,16 @@ a limited buffer. */
# define ut_snprintf snprintf
#endif /* __WIN__ */
+/*************************************************************//**
+Convert an error number to a human readable text message. The
+returned string is static and should not be freed or modified.
+@return string, describing the error */
+UNIV_INTERN
+const char*
+ut_strerr(
+/*======*/
+ enum db_err num); /*!< in: error number */
+
#ifndef UNIV_NONINL
#include "ut0ut.ic"
#endif
diff --git a/storage/innobase/lock/lock0lock.c b/storage/innobase/lock/lock0lock.c
index 1fce8002bdf..d5fff572aee 100644
--- a/storage/innobase/lock/lock0lock.c
+++ b/storage/innobase/lock/lock0lock.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -376,6 +376,7 @@ UNIV_INTERN FILE* lock_latest_err_file;
/* Flags for recursive deadlock search */
#define LOCK_VICTIM_IS_START 1
#define LOCK_VICTIM_IS_OTHER 2
+#define LOCK_EXCEED_MAX_DEPTH 3
/********************************************************************//**
Checks if a lock request results in a deadlock.
@@ -394,24 +395,25 @@ Looks recursively for a deadlock.
deadlock and we chose 'start' as the victim, LOCK_VICTIM_IS_OTHER if a
deadlock was found and we chose some other trx as a victim: we must do
the search again in this last case because there may be another
-deadlock! */
+deadlock!
+LOCK_EXCEED_MAX_DEPTH if the lock search exceeds max steps or max depth. */
static
ulint
lock_deadlock_recursive(
/*====================*/
trx_t* start, /*!< in: recursion starting point */
trx_t* trx, /*!< in: a transaction waiting for a lock */
- lock_t* wait_lock, /*!< in: the lock trx is waiting to be granted */
+ lock_t* wait_lock, /*!< in: lock that is waiting to be granted */
ulint* cost, /*!< in/out: number of calculation steps thus
far: if this exceeds LOCK_MAX_N_STEPS_...
- we return LOCK_VICTIM_IS_START */
+ we return LOCK_EXCEED_MAX_DEPTH */
ulint depth); /*!< in: recursion depth: if this exceeds
LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK, we
- return LOCK_VICTIM_IS_START */
+ return LOCK_EXCEED_MAX_DEPTH */
/*********************************************************************//**
Gets the nth bit of a record lock.
-@return TRUE if bit set */
+@return TRUE if bit set also if i == ULINT_UNDEFINED return FALSE*/
UNIV_INLINE
ibool
lock_rec_get_nth_bit(
@@ -1222,7 +1224,7 @@ lock_rec_get_first_on_page(
/*********************************************************************//**
Gets the next explicit lock request on a record.
-@return next lock, NULL if none exists */
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
UNIV_INLINE
lock_t*
lock_rec_get_next(
@@ -3261,8 +3263,6 @@ lock_deadlock_occurs(
lock_t* lock, /*!< in: lock the transaction is requesting */
trx_t* trx) /*!< in: transaction */
{
- dict_table_t* table;
- dict_index_t* index;
trx_t* mark_trx;
ulint ret;
ulint cost = 0;
@@ -3284,31 +3284,50 @@ retry:
ret = lock_deadlock_recursive(trx, trx, lock, &cost, 0);
- if (ret == LOCK_VICTIM_IS_OTHER) {
+ switch (ret) {
+ case LOCK_VICTIM_IS_OTHER:
/* We chose some other trx as a victim: retry if there still
is a deadlock */
-
goto retry;
- }
- if (UNIV_UNLIKELY(ret == LOCK_VICTIM_IS_START)) {
- if (lock_get_type_low(lock) & LOCK_TABLE) {
- table = lock->un_member.tab_lock.table;
- index = NULL;
+ case LOCK_EXCEED_MAX_DEPTH:
+ /* If the lock search exceeds the max step
+ or the max depth, the current trx will be
+ the victim. Print its information. */
+ rewind(lock_latest_err_file);
+ ut_print_timestamp(lock_latest_err_file);
+
+ fputs("TOO DEEP OR LONG SEARCH IN THE LOCK TABLE"
+ " WAITS-FOR GRAPH, WE WILL ROLL BACK"
+ " FOLLOWING TRANSACTION \n",
+ lock_latest_err_file);
+
+ fputs("\n*** TRANSACTION:\n", lock_latest_err_file);
+ trx_print(lock_latest_err_file, trx, 3000);
+
+ fputs("*** WAITING FOR THIS LOCK TO BE GRANTED:\n",
+ lock_latest_err_file);
+
+ if (lock_get_type(lock) == LOCK_REC) {
+ lock_rec_print(lock_latest_err_file, lock);
} else {
- index = lock->index;
- table = index->table;
+ lock_table_print(lock_latest_err_file, lock);
}
+ break;
- lock_deadlock_found = TRUE;
-
+ case LOCK_VICTIM_IS_START:
fputs("*** WE ROLL BACK TRANSACTION (2)\n",
lock_latest_err_file);
+ break;
- return(TRUE);
+ default:
+ /* No deadlock detected*/
+ return(FALSE);
}
- return(FALSE);
+ lock_deadlock_found = TRUE;
+
+ return(TRUE);
}
/********************************************************************//**
@@ -3317,25 +3336,26 @@ Looks recursively for a deadlock.
deadlock and we chose 'start' as the victim, LOCK_VICTIM_IS_OTHER if a
deadlock was found and we chose some other trx as a victim: we must do
the search again in this last case because there may be another
-deadlock! */
+deadlock!
+LOCK_EXCEED_MAX_DEPTH if the lock search exceeds max steps or max depth. */
static
ulint
lock_deadlock_recursive(
/*====================*/
trx_t* start, /*!< in: recursion starting point */
trx_t* trx, /*!< in: a transaction waiting for a lock */
- lock_t* wait_lock, /*!< in: the lock trx is waiting to be granted */
+ lock_t* wait_lock, /*!< in: lock that is waiting to be granted */
ulint* cost, /*!< in/out: number of calculation steps thus
far: if this exceeds LOCK_MAX_N_STEPS_...
- we return LOCK_VICTIM_IS_START */
+ we return LOCK_EXCEED_MAX_DEPTH */
ulint depth) /*!< in: recursion depth: if this exceeds
LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK, we
- return LOCK_VICTIM_IS_START */
+ return LOCK_EXCEED_MAX_DEPTH */
{
+ ulint ret;
lock_t* lock;
- ulint bit_no = ULINT_UNDEFINED;
trx_t* lock_trx;
- ulint ret;
+ ulint heap_no = ULINT_UNDEFINED;
ut_a(trx);
ut_a(start);
@@ -3351,27 +3371,44 @@ lock_deadlock_recursive(
*cost = *cost + 1;
- lock = wait_lock;
-
if (lock_get_type_low(wait_lock) == LOCK_REC) {
+ ulint space;
+ ulint page_no;
+
+ heap_no = lock_rec_find_set_bit(wait_lock);
+ ut_a(heap_no != ULINT_UNDEFINED);
- bit_no = lock_rec_find_set_bit(wait_lock);
+ space = wait_lock->un_member.rec_lock.space;
+ page_no = wait_lock->un_member.rec_lock.page_no;
- ut_a(bit_no != ULINT_UNDEFINED);
+ lock = lock_rec_get_first_on_page_addr(space, page_no);
+
+ /* Position the iterator on the first matching record lock. */
+ while (lock != NULL
+ && lock != wait_lock
+ && !lock_rec_get_nth_bit(lock, heap_no)) {
+
+ lock = lock_rec_get_next_on_page(lock);
+ }
+
+ if (lock == wait_lock) {
+ lock = NULL;
+ }
+
+ ut_ad(lock == NULL || lock_rec_get_nth_bit(lock, heap_no));
+
+ } else {
+ lock = wait_lock;
}
/* Look at the locks ahead of wait_lock in the lock queue */
for (;;) {
- if (lock_get_type_low(lock) & LOCK_TABLE) {
-
- lock = UT_LIST_GET_PREV(un_member.tab_lock.locks,
- lock);
- } else {
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
- ut_a(bit_no != ULINT_UNDEFINED);
+ /* Get previous table lock. */
+ if (heap_no == ULINT_UNDEFINED) {
- lock = (lock_t*) lock_rec_get_prev(lock, bit_no);
+ lock = UT_LIST_GET_PREV(
+ un_member.tab_lock.locks, lock);
}
if (lock == NULL) {
@@ -3389,7 +3426,7 @@ lock_deadlock_recursive(
lock_trx = lock->trx;
- if (lock_trx == start || too_far) {
+ if (lock_trx == start) {
/* We came back to the recursion starting
point: a deadlock detected; or we have
@@ -3436,19 +3473,10 @@ lock_deadlock_recursive(
}
#ifdef UNIV_DEBUG
if (lock_print_waits) {
- fputs("Deadlock detected"
- " or too long search\n",
+ fputs("Deadlock detected\n",
stderr);
}
#endif /* UNIV_DEBUG */
- if (too_far) {
-
- fputs("TOO DEEP OR LONG SEARCH"
- " IN THE LOCK TABLE"
- " WAITS-FOR GRAPH\n", ef);
-
- return(LOCK_VICTIM_IS_START);
- }
if (trx_weight_cmp(wait_lock->trx,
start) >= 0) {
@@ -3484,6 +3512,21 @@ lock_deadlock_recursive(
return(LOCK_VICTIM_IS_OTHER);
}
+ if (too_far) {
+
+#ifdef UNIV_DEBUG
+ if (lock_print_waits) {
+ fputs("Deadlock search exceeds"
+ " max steps or depth.\n",
+ stderr);
+ }
+#endif /* UNIV_DEBUG */
+ /* The information about transaction/lock
+ to be rolled back is available in the top
+ level. Do not print anything here. */
+ return(LOCK_EXCEED_MAX_DEPTH);
+ }
+
if (lock_trx->que_state == TRX_QUE_LOCK_WAIT) {
/* Another trx ahead has requested lock in an
@@ -3493,12 +3536,28 @@ lock_deadlock_recursive(
ret = lock_deadlock_recursive(
start, lock_trx,
lock_trx->wait_lock, cost, depth + 1);
+
if (ret != 0) {
return(ret);
}
}
}
+ /* Get the next record lock to check. */
+ if (heap_no != ULINT_UNDEFINED) {
+
+ ut_a(lock != NULL);
+
+ do {
+ lock = lock_rec_get_next_on_page(lock);
+ } while (lock != NULL
+ && lock != wait_lock
+ && !lock_rec_get_nth_bit(lock, heap_no));
+
+ if (lock == wait_lock) {
+ lock = NULL;
+ }
+ }
}/* end of the 'for (;;)'-loop */
}
@@ -3694,9 +3753,10 @@ lock_table_enqueue_waiting(
/*********************************************************************//**
Checks if other transactions have an incompatible mode lock request in
-the lock queue. */
+the lock queue.
+@return lock or NULL */
UNIV_INLINE
-ibool
+lock_t*
lock_table_other_has_incompatible(
/*==============================*/
trx_t* trx, /*!< in: transaction, or NULL if all
@@ -3718,13 +3778,13 @@ lock_table_other_has_incompatible(
&& (!lock_mode_compatible(lock_get_mode(lock), mode))
&& (wait || !(lock_get_wait(lock)))) {
- return(TRUE);
+ return(lock);
}
lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
}
- return(FALSE);
+ return(NULL);
}
/*********************************************************************//**
@@ -4249,28 +4309,29 @@ lock_rec_print(
block = buf_page_try_get(space, page_no, &mtr);
- if (block) {
- for (i = 0; i < lock_rec_get_n_bits(lock); i++) {
+ for (i = 0; i < lock_rec_get_n_bits(lock); ++i) {
+
+ if (!lock_rec_get_nth_bit(lock, i)) {
+ continue;
+ }
+
+ fprintf(file, "Record lock, heap no %lu", (ulong) i);
- if (lock_rec_get_nth_bit(lock, i)) {
+ if (block) {
+ const rec_t* rec;
- const rec_t* rec
- = page_find_rec_with_heap_no(
- buf_block_get_frame(block), i);
- offsets = rec_get_offsets(
- rec, lock->index, offsets,
- ULINT_UNDEFINED, &heap);
+ rec = page_find_rec_with_heap_no(
+ buf_block_get_frame(block), i);
- fprintf(file, "Record lock, heap no %lu ",
- (ulong) i);
- rec_print_new(file, rec, offsets);
- putc('\n', file);
- }
- }
- } else {
- for (i = 0; i < lock_rec_get_n_bits(lock); i++) {
- fprintf(file, "Record lock, heap no %lu\n", (ulong) i);
+ offsets = rec_get_offsets(
+ rec, lock->index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ putc(' ', file);
+ rec_print_new(file, rec, offsets);
}
+
+ putc('\n', file);
}
mtr_commit(&mtr);
@@ -4317,14 +4378,26 @@ lock_get_n_rec_locks(void)
#endif /* PRINT_NUM_OF_LOCK_STRUCTS */
/*********************************************************************//**
-Prints info of locks for all transactions. */
+Prints info of locks for all transactions.
+@return FALSE if not able to obtain kernel mutex
+and exits without printing info */
UNIV_INTERN
-void
+ibool
lock_print_info_summary(
/*====================*/
- FILE* file) /*!< in: file where to print */
+ FILE* file, /*!< in: file where to print */
+ ibool nowait) /*!< in: whether to wait for the kernel mutex */
{
- lock_mutex_enter_kernel();
+ /* if nowait is FALSE, wait on the kernel mutex,
+ otherwise return immediately if fail to obtain the
+ mutex. */
+ if (!nowait) {
+ lock_mutex_enter_kernel();
+ } else if (mutex_enter_nowait(&kernel_mutex)) {
+ fputs("FAIL TO OBTAIN KERNEL MUTEX, "
+ "SKIP LOCK INFO PRINTING\n", file);
+ return(FALSE);
+ }
if (lock_deadlock_found) {
fputs("------------------------\n"
@@ -4356,6 +4429,7 @@ lock_print_info_summary(
"Total number of lock structs in row lock hash table %lu\n",
(ulong) lock_get_n_rec_locks());
#endif /* PRINT_NUM_OF_LOCK_STRUCTS */
+ return(TRUE);
}
/*********************************************************************//**
@@ -4753,6 +4827,13 @@ loop:
|| lock->trx->conc_state == TRX_PREPARED
|| lock->trx->conc_state == TRX_COMMITTED_IN_MEMORY);
+# ifdef UNIV_SYNC_DEBUG
+ /* Only validate the record queues when this thread is not
+ holding a space->latch. Deadlocks are possible due to
+ latching order violation when UNIV_DEBUG is defined while
+ UNIV_SYNC_DEBUG is not. */
+ if (!sync_thread_levels_contains(SYNC_FSP))
+# endif /* UNIV_SYNC_DEBUG */
for (i = nth_bit; i < lock_rec_get_n_bits(lock); i++) {
if (i == 1 || lock_rec_get_nth_bit(lock, i)) {
@@ -4918,7 +4999,7 @@ lock_rec_insert_check_and_lock(
}
trx = thr_get_trx(thr);
- next_rec = page_rec_get_next((rec_t*) rec);
+ next_rec = page_rec_get_next_const(rec);
next_rec_heap_no = page_rec_get_heap_no(next_rec);
lock_mutex_enter_kernel();
diff --git a/storage/innobase/log/log0log.c b/storage/innobase/log/log0log.c
index d5b696074b3..04ced18bc69 100644
--- a/storage/innobase/log/log0log.c
+++ b/storage/innobase/log/log0log.c
@@ -1,23 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
-
-*****************************************************************************/
-/*****************************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2009, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -99,6 +82,17 @@ UNIV_INTERN ulint log_fsp_current_free_limit = 0;
/* Global log system variable */
UNIV_INTERN log_t* log_sys = NULL;
+#ifdef UNIV_PFS_RWLOCK
+UNIV_INTERN mysql_pfs_key_t checkpoint_lock_key;
+# ifdef UNIV_LOG_ARCHIVE
+UNIV_INTERN mysql_pfs_key_t archive_lock_key;
+# endif
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t log_sys_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
#ifdef UNIV_DEBUG
UNIV_INTERN ibool log_do_write = TRUE;
#endif /* UNIV_DEBUG */
@@ -773,7 +767,7 @@ log_init(void)
{
log_sys = mem_alloc(sizeof(log_t));
- mutex_create(&log_sys->mutex, SYNC_LOG);
+ mutex_create(log_sys_mutex_key, &log_sys->mutex, SYNC_LOG);
mutex_enter(&(log_sys->mutex));
@@ -829,7 +823,8 @@ log_init(void)
log_sys->last_checkpoint_lsn = log_sys->lsn;
log_sys->n_pending_checkpoint_writes = 0;
- rw_lock_create(&log_sys->checkpoint_lock, SYNC_NO_ORDER_CHECK);
+ rw_lock_create(checkpoint_lock_key, &log_sys->checkpoint_lock,
+ SYNC_NO_ORDER_CHECK);
log_sys->checkpoint_buf_ptr = mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE);
log_sys->checkpoint_buf = ut_align(log_sys->checkpoint_buf_ptr,
@@ -845,7 +840,8 @@ log_init(void)
log_sys->n_pending_archive_ios = 0;
- rw_lock_create(&log_sys->archive_lock, SYNC_NO_ORDER_CHECK);
+ rw_lock_create(archive_lock_key, &log_sys->archive_lock,
+ SYNC_NO_ORDER_CHECK);
log_sys->archive_buf = NULL;
@@ -2013,7 +2009,7 @@ log_checkpoint(
return(TRUE);
}
- ut_ad(log_sys->written_to_all_lsn >= oldest_lsn);
+ ut_ad(log_sys->flushed_to_disk_lsn >= oldest_lsn);
if (log_sys->n_pending_checkpoint_writes > 0) {
/* A checkpoint write is running */
@@ -2371,13 +2367,15 @@ loop:
log_archived_file_name_gen(name, group->id,
group->archived_file_no + n_files);
- file_handle = os_file_create(name, open_mode, OS_FILE_AIO,
+ file_handle = os_file_create(innodb_file_log_key,
+ name, open_mode,
+ OS_FILE_AIO,
OS_DATA_FILE, &ret);
if (!ret && (open_mode == OS_FILE_CREATE)) {
file_handle = os_file_create(
- name, OS_FILE_OPEN, OS_FILE_AIO,
- OS_DATA_FILE, &ret);
+ innodb_file_log_key, name, OS_FILE_OPEN,
+ OS_FILE_AIO, OS_DATA_FILE, &ret);
}
if (!ret) {
@@ -3095,7 +3093,7 @@ loop:
if (srv_fast_shutdown < 2
&& (srv_error_monitor_active
- || srv_lock_timeout_and_monitor_active)) {
+ || srv_lock_timeout_active || srv_monitor_active)) {
mutex_exit(&kernel_mutex);
diff --git a/storage/innobase/log/log0recv.c b/storage/innobase/log/log0recv.c
index ddbc71d4b71..0e96dbbb960 100644
--- a/storage/innobase/log/log0recv.c
+++ b/storage/innobase/log/log0recv.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -138,7 +138,9 @@ UNIV_INTERN ulint recv_max_parsed_page_no;
/** This many frames must be left free in the buffer pool when we scan
the log and store the scanned log records in the buffer pool: we will
use these free frames to read in pages when we start applying the
-log records to the database. */
+log records to the database.
+This is the default value. If the actual size of the buffer pool is
+larger than 10 MB we'll set this value to 512. */
UNIV_INTERN ulint recv_n_pool_free_frames;
/** The maximum lsn we see for a page during the recovery process. If this
@@ -146,6 +148,14 @@ is bigger than the lsn we are able to scan up to, that is an indication that
the recovery failed and the database may be corrupt. */
UNIV_INTERN ib_uint64_t recv_max_page_lsn;
+#ifdef UNIV_PFS_THREAD
+UNIV_INTERN mysql_pfs_key_t trx_rollback_clean_thread_key;
+#endif /* UNIV_PFS_THREAD */
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t recv_sys_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
/* prototypes */
#ifndef UNIV_HOTBACKUP
@@ -173,7 +183,7 @@ recv_sys_create(void)
recv_sys = mem_alloc(sizeof(*recv_sys));
memset(recv_sys, 0x0, sizeof(*recv_sys));
- mutex_create(&recv_sys->mutex, SYNC_RECV);
+ mutex_create(recv_sys_mutex_key, &recv_sys->mutex, SYNC_RECV);
recv_sys->heap = NULL;
recv_sys->addr_hash = NULL;
@@ -239,6 +249,7 @@ recv_sys_mem_free(void)
}
}
+#ifndef UNIV_HOTBACKUP
/************************************************************
Reset the state of the recovery system variables. */
UNIV_INTERN
@@ -278,6 +289,7 @@ recv_sys_var_init(void)
recv_max_page_lsn = 0;
}
+#endif /* !UNIV_HOTBACKUP */
/************************************************************
Inits the recovery system for a recovery operation. */
@@ -292,6 +304,14 @@ recv_sys_init(
return;
}
+#ifndef UNIV_HOTBACKUP
+ /* Initialize red-black tree for fast insertions into the
+ flush_list during recovery process.
+ As this initialization is done while holding the buffer pool
+ mutex we perform it before acquiring recv_sys->mutex. */
+ buf_flush_init_flush_rbt();
+#endif /* !UNIV_HOTBACKUP */
+
mutex_enter(&(recv_sys->mutex));
#ifndef UNIV_HOTBACKUP
@@ -301,6 +321,12 @@ recv_sys_init(
recv_is_from_backup = TRUE;
#endif /* !UNIV_HOTBACKUP */
+ /* Set appropriate value of recv_n_pool_free_frames. */
+ if (buf_pool_get_curr_size() >= (10 * 1024 * 1024)) {
+ /* Buffer pool of size greater than 10 MB. */
+ recv_n_pool_free_frames = 512;
+ }
+
recv_sys->buf = ut_malloc(RECV_PARSING_BUF_SIZE);
recv_sys->len = 0;
recv_sys->recovered_offset = 0;
@@ -370,6 +396,9 @@ recv_sys_debug_free(void)
recv_sys->last_block_buf_start = NULL;
mutex_exit(&(recv_sys->mutex));
+
+ /* Free up the flush_rbt. */
+ buf_flush_free_flush_rbt();
}
# endif /* UNIV_LOG_DEBUG */
@@ -1632,7 +1661,9 @@ recv_recover_page_func(
if (modification_to_page) {
ut_a(block);
+ buf_flush_order_mutex_enter();
buf_flush_recv_note_modification(block, start_lsn, end_lsn);
+ buf_flush_order_mutex_exit();
}
#endif /* !UNIV_HOTBACKUP */
@@ -2050,15 +2081,6 @@ recv_parse_log_rec(
}
#endif /* UNIV_LOG_LSN_DEBUG */
- /* Check that page_no is sensible */
-
- if (UNIV_UNLIKELY(*page_no > 0x8FFFFFFFUL)) {
-
- recv_sys->found_corrupt_log = TRUE;
-
- return(0);
- }
-
new_ptr = recv_parse_or_apply_log_rec_body(*type, new_ptr, end_ptr,
NULL, NULL);
if (UNIV_UNLIKELY(new_ptr == NULL)) {
@@ -2167,6 +2189,14 @@ recv_report_corrupt_log(
putc('\n', stderr);
}
+#ifndef UNIV_HOTBACKUP
+ if (!srv_force_recovery) {
+ fputs("InnoDB: Set innodb_force_recovery"
+ " to ignore this error.\n", stderr);
+ ut_error;
+ }
+#endif /* !UNIV_HOTBACKUP */
+
fputs("InnoDB: WARNING: the log file may have been corrupt and it\n"
"InnoDB: is possible that the log scan did not proceed\n"
"InnoDB: far enough in recovery! Please run CHECK TABLE\n"
@@ -2556,7 +2586,7 @@ recv_scan_log_recs(
ut_ad(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
ut_ad(len % OS_FILE_LOG_BLOCK_SIZE == 0);
- ut_ad(len > 0);
+ ut_ad(len >= OS_FILE_LOG_BLOCK_SIZE);
ut_a(store_to_hash <= TRUE);
finished = FALSE;
@@ -2681,6 +2711,16 @@ recv_scan_log_recs(
recv_sys->found_corrupt_log = TRUE;
+#ifndef UNIV_HOTBACKUP
+ if (!srv_force_recovery) {
+ fputs("InnoDB: Set"
+ " innodb_force_recovery"
+ " to ignore this error.\n",
+ stderr);
+ ut_error;
+ }
+#endif /* !UNIV_HOTBACKUP */
+
} else if (!recv_sys->found_corrupt_log) {
more_data = recv_sys_add_to_parsing_buf(
log_block, scanned_lsn);
@@ -3210,8 +3250,6 @@ void
recv_recovery_from_checkpoint_finish(void)
/*======================================*/
{
- int i;
-
/* Apply the hashed log records to the respective file pages */
if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) {
@@ -3259,9 +3297,16 @@ recv_recovery_from_checkpoint_finish(void)
The data dictionary latch should guarantee that there is at
most one data dictionary transaction active at a time. */
trx_rollback_or_clean_recovered(FALSE);
+}
- /* Drop partially created indexes. */
- row_merge_drop_temp_indexes();
+/********************************************************//**
+Initiates the rollback of active transactions. */
+UNIV_INTERN
+void
+recv_recovery_rollback_active(void)
+/*===============================*/
+{
+ int i;
#ifdef UNIV_SYNC_DEBUG
/* Wait for a while so that created threads have time to suspend
@@ -3271,6 +3316,11 @@ recv_recovery_from_checkpoint_finish(void)
/* Switch latching order checks on in sync0sync.c */
sync_order_checks_on = TRUE;
#endif
+ /* Drop partially created indexes. */
+ row_merge_drop_temp_indexes();
+ /* Drop temporary tables. */
+ row_mysql_drop_temp_tables();
+
if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) {
/* Rollback the uncommitted transactions which have no user
session */
@@ -3386,8 +3436,10 @@ recv_reset_log_files_for_backup(
sprintf(name, "%s%s%lu", log_dir,
ib_logfile_basename, (ulong)i);
- log_file = os_file_create_simple(name, OS_FILE_CREATE,
- OS_FILE_READ_WRITE, &success);
+ log_file = os_file_create_simple(innodb_file_log_key,
+ name, OS_FILE_CREATE,
+ OS_FILE_READ_WRITE,
+ &success);
if (!success) {
fprintf(stderr,
"InnoDB: Cannot create %s. Check that"
@@ -3426,7 +3478,8 @@ recv_reset_log_files_for_backup(
LOG_BLOCK_HDR_SIZE);
sprintf(name, "%s%s%lu", log_dir, ib_logfile_basename, (ulong)0);
- log_file = os_file_create_simple(name, OS_FILE_OPEN,
+ log_file = os_file_create_simple(innodb_file_log_key,
+ name, OS_FILE_OPEN,
OS_FILE_READ_WRITE, &success);
if (!success) {
fprintf(stderr, "InnoDB: Cannot open %s.\n", name);
@@ -3476,7 +3529,8 @@ try_open_again:
log_archived_file_name_gen(name, group->id, group->archived_file_no);
- file_handle = os_file_create(name, OS_FILE_OPEN,
+ file_handle = os_file_create(innodb_file_log_key,
+ name, OS_FILE_OPEN,
OS_FILE_LOG, OS_FILE_AIO, &ret);
if (ret == FALSE) {
diff --git a/storage/innobase/mem/mem0dbg.c b/storage/innobase/mem/mem0dbg.c
index 01eda20ec45..d91e610a08a 100644
--- a/storage/innobase/mem/mem0dbg.c
+++ b/storage/innobase/mem/mem0dbg.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -29,7 +29,13 @@ Created 6/9/1994 Heikki Tuuri
/* The mutex which protects in the debug version the hash table
containing the list of live memory heaps, and also the global
variables below. */
-UNIV_INTERN mutex_t mem_hash_mutex;
+UNIV_INTERN mutex_t mem_hash_mutex;
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register mem_hash_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t mem_hash_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
# endif /* !UNIV_HOTBACKUP */
/* The following variables contain information about the
@@ -149,7 +155,7 @@ mem_init(
/* Initialize the hash table */
ut_a(FALSE == mem_hash_initialized);
- mutex_create(&mem_hash_mutex, SYNC_MEM_HASH);
+ mutex_create(mem_hash_mutex_key, &mem_hash_mutex, SYNC_MEM_HASH);
for (i = 0; i < MEM_HASH_SIZE; i++) {
UT_LIST_INIT(*mem_hash_get_nth_cell(i));
@@ -180,6 +186,10 @@ mem_close(void)
{
mem_pool_free(mem_comm_pool);
mem_comm_pool = NULL;
+#ifdef UNIV_MEM_DEBUG
+ mutex_free(&mem_hash_mutex);
+ mem_hash_initialized = FALSE;
+#endif /* UNIV_MEM_DEBUG */
}
#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/mem/mem0mem.c b/storage/innobase/mem/mem0mem.c
index ccb2fd8a7b4..c0ce8a3e1ac 100644
--- a/storage/innobase/mem/mem0mem.c
+++ b/storage/innobase/mem/mem0mem.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -383,6 +383,20 @@ mem_heap_create_block(
mem_block_set_free(block, MEM_BLOCK_HEADER_SIZE);
mem_block_set_start(block, MEM_BLOCK_HEADER_SIZE);
+ if (UNIV_UNLIKELY(heap == NULL)) {
+ /* This is the first block of the heap. The field
+ total_size should be initialized here */
+ block->total_size = len;
+ } else {
+ /* Not the first allocation for the heap. This block's
+ total_length field should be set to undefined. */
+ ut_d(block->total_size = ULINT_UNDEFINED);
+ UNIV_MEM_INVALID(&block->total_size,
+ sizeof block->total_size);
+
+ heap->total_size += len;
+ }
+
ut_ad((ulint)MEM_BLOCK_HEADER_SIZE < len);
return(block);
@@ -471,6 +485,10 @@ mem_heap_block_free(
mem_pool_mutex_exit();
#endif
+
+ ut_ad(heap->total_size >= block->len);
+ heap->total_size -= block->len;
+
type = heap->type;
len = block->len;
block->magic_n = MEM_FREED_BLOCK_MAGIC_N;
diff --git a/storage/innobase/mem/mem0pool.c b/storage/innobase/mem/mem0pool.c
index c4f8af607e0..cb33e788bee 100644
--- a/storage/innobase/mem/mem0pool.c
+++ b/storage/innobase/mem/mem0pool.c
@@ -114,6 +114,11 @@ struct mem_pool_struct{
/** The common memory pool */
UNIV_INTERN mem_pool_t* mem_comm_pool = NULL;
+#ifdef UNIV_PFS_MUTEX
+/* Key to register mutex in mem_pool_struct with performance schema */
+UNIV_INTERN mysql_pfs_key_t mem_pool_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
/* We use this counter to check that the mem pool mutex does not leak;
this is to track a strange assertion failure reported at
mysql@lists.mysql.com */
@@ -219,7 +224,7 @@ mem_pool_create(
pool->buf = ut_malloc_low(size, FALSE, TRUE);
pool->size = size;
- mutex_create(&pool->mutex, SYNC_MEM_POOL);
+ mutex_create(mem_pool_mutex_key, &pool->mutex, SYNC_MEM_POOL);
/* Initialize the free lists */
diff --git a/storage/innobase/mtr/mtr0mtr.c b/storage/innobase/mtr/mtr0mtr.c
index 417e97732bb..78618564ef1 100644
--- a/storage/innobase/mtr/mtr0mtr.c
+++ b/storage/innobase/mtr/mtr0mtr.c
@@ -30,6 +30,7 @@ Created 11/26/1995 Heikki Tuuri
#endif
#include "buf0buf.h"
+#include "buf0flu.h"
#include "page0types.h"
#include "mtr0log.h"
#include "log0log.h"
@@ -38,7 +39,7 @@ Created 11/26/1995 Heikki Tuuri
# include "log0recv.h"
/*****************************************************************//**
Releases the item in the slot given. */
-UNIV_INLINE
+static
void
mtr_memo_slot_release(
/*==================*/
@@ -48,14 +49,19 @@ mtr_memo_slot_release(
void* object;
ulint type;
- ut_ad(mtr && slot);
+ ut_ad(mtr);
+ ut_ad(slot);
+
+#ifndef UNIV_DEBUG
+ UT_NOT_USED(mtr);
+#endif /* UNIV_DEBUG */
object = slot->object;
type = slot->type;
if (UNIV_LIKELY(object != NULL)) {
if (type <= MTR_MEMO_BUF_FIX) {
- buf_page_release((buf_block_t*)object, type, mtr);
+ buf_page_release((buf_block_t*)object, type);
} else if (type == MTR_MEMO_S_LOCK) {
rw_lock_s_unlock((rw_lock_t*)object);
#ifdef UNIV_DEBUG
@@ -73,13 +79,10 @@ mtr_memo_slot_release(
}
/**********************************************************//**
-Releases the mlocks and other objects stored in an mtr memo. They are released
-in the order opposite to which they were pushed to the memo. NOTE! It is
-essential that the x-rw-lock on a modified buffer page is not released before
-buf_page_note_modification is called for that page! Otherwise, some thread
-might race to modify it, and the flush list sort order on lsn would be
-destroyed. */
-UNIV_INLINE
+Releases the mlocks and other objects stored in an mtr memo.
+They are released in the order opposite to which they were pushed
+to the memo. */
+static
void
mtr_memo_pop_all(
/*=============*/
@@ -105,6 +108,59 @@ mtr_memo_pop_all(
}
}
+/*****************************************************************//**
+Releases the item in the slot given. */
+static
+void
+mtr_memo_slot_note_modification(
+/*============================*/
+ mtr_t* mtr, /*!< in: mtr */
+ mtr_memo_slot_t* slot) /*!< in: memo slot */
+{
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->modifications);
+ ut_ad(buf_flush_order_mutex_own());
+
+ if (slot->object != NULL && slot->type == MTR_MEMO_PAGE_X_FIX) {
+ buf_flush_note_modification((buf_block_t*) slot->object, mtr);
+ }
+}
+
+/**********************************************************//**
+Add the modified pages to the buffer flush list. They are released
+in the order opposite to which they were pushed to the memo. NOTE! It is
+essential that the x-rw-lock on a modified buffer page is not released
+before buf_page_note_modification is called for that page! Otherwise,
+some thread might race to modify it, and the flush list sort order on
+lsn would be destroyed. */
+static
+void
+mtr_memo_note_modifications(
+/*========================*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dyn_array_t* memo;
+ ulint offset;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in
+ commit */
+ memo = &mtr->memo;
+
+ offset = dyn_array_get_data_size(memo);
+
+ while (offset > 0) {
+ mtr_memo_slot_t* slot;
+
+ offset -= sizeof(mtr_memo_slot_t);
+ slot = dyn_array_get_element(memo, offset);
+
+ mtr_memo_slot_note_modification(mtr, slot);
+ }
+}
+
/************************************************************//**
Writes the contents of a mini-transaction log, if any, to the database log. */
static
@@ -137,7 +193,9 @@ mtr_log_reserve_and_write(
&mtr->start_lsn);
if (mtr->end_lsn) {
- return;
+ /* Success. We have the log mutex.
+ Add pages to flush list and exit */
+ goto func_exit;
}
}
@@ -161,6 +219,18 @@ mtr_log_reserve_and_write(
}
mtr->end_lsn = log_close();
+
+func_exit:
+ buf_flush_order_mutex_enter();
+
+ /* It is now safe to release the log mutex because the
+ flush_order mutex will ensure that we are the first one
+ to insert into the flush list. */
+ log_release();
+ if (mtr->modifications) {
+ mtr_memo_note_modifications(mtr);
+ }
+ buf_flush_order_mutex_exit();
}
#endif /* !UNIV_HOTBACKUP */
@@ -172,10 +242,6 @@ mtr_commit(
/*=======*/
mtr_t* mtr) /*!< in: mini-transaction */
{
-#ifndef UNIV_HOTBACKUP
- ibool write_log;
-#endif /* !UNIV_HOTBACKUP */
-
ut_ad(mtr);
ut_ad(mtr->magic_n == MTR_MAGIC_N);
ut_ad(mtr->state == MTR_ACTIVE);
@@ -184,25 +250,12 @@ mtr_commit(
#ifndef UNIV_HOTBACKUP
/* This is a dirty read, for debugging. */
ut_ad(!recv_no_log_write);
- write_log = mtr->modifications && mtr->n_log_recs;
- if (write_log) {
+ if (mtr->modifications && mtr->n_log_recs) {
mtr_log_reserve_and_write(mtr);
}
- /* We first update the modification info to buffer pages, and only
- after that release the log mutex: this guarantees that when the log
- mutex is free, all buffer pages contain an up-to-date info of their
- modifications. This fact is used in making a checkpoint when we look
- at the oldest modification of any page in the buffer pool. It is also
- required when we insert modified buffer pages in to the flush list
- which must be sorted on oldest_modification. */
-
mtr_memo_pop_all(mtr);
-
- if (write_log) {
- log_release();
- }
#endif /* !UNIV_HOTBACKUP */
ut_d(mtr->state = MTR_COMMITTED);
@@ -241,6 +294,10 @@ mtr_rollback_to_savepoint(
slot = dyn_array_get_element(memo, offset);
ut_ad(slot->type != MTR_MEMO_MODIFY);
+
+ /* We do not call mtr_memo_slot_note_modification()
+ because there MUST be no changes made to the buffer
+ pages after the savepoint */
mtr_memo_slot_release(mtr, slot);
}
}
@@ -267,18 +324,23 @@ mtr_memo_release(
offset = dyn_array_get_data_size(memo);
+ buf_flush_order_mutex_enter();
while (offset > 0) {
offset -= sizeof(mtr_memo_slot_t);
slot = dyn_array_get_element(memo, offset);
- if ((object == slot->object) && (type == slot->type)) {
+ if (object == slot->object && type == slot->type) {
+ if (mtr->modifications) {
+ mtr_memo_slot_note_modification(mtr, slot);
+ }
mtr_memo_slot_release(mtr, slot);
break;
}
}
+ buf_flush_order_mutex_exit();
}
#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/mysql-test/innodb-autoinc-44030.result b/storage/innobase/mysql-test/innodb-autoinc-44030.result
new file mode 100644
index 00000000000..c0695bf0be0
--- /dev/null
+++ b/storage/innobase/mysql-test/innodb-autoinc-44030.result
@@ -0,0 +1,30 @@
+drop table if exists t1;
+SET @@SESSION.AUTO_INCREMENT_INCREMENT=1, @@SESSION.AUTO_INCREMENT_OFFSET=1;
+CREATE TABLE t1 (c1 INT PRIMARY KEY AUTO_INCREMENT) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (null);
+INSERT INTO t1 VALUES (null);
+ALTER TABLE t1 CHANGE c1 d1 INT NOT NULL AUTO_INCREMENT;
+SELECT * FROM t1;
+d1
+1
+2
+SELECT * FROM t1;
+d1
+1
+2
+INSERT INTO t1 VALUES(null);
+Got one of the listed errors
+ALTER TABLE t1 AUTO_INCREMENT = 3;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `d1` int(11) NOT NULL AUTO_INCREMENT,
+ PRIMARY KEY (`d1`)
+) ENGINE=InnoDB AUTO_INCREMENT=3 DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES(null);
+SELECT * FROM t1;
+d1
+1
+2
+3
+DROP TABLE t1;
diff --git a/storage/innobase/mysql-test/innodb-autoinc-44030.test b/storage/innobase/mysql-test/innodb-autoinc-44030.test
new file mode 100644
index 00000000000..af2e3015280
--- /dev/null
+++ b/storage/innobase/mysql-test/innodb-autoinc-44030.test
@@ -0,0 +1,34 @@
+-- source include/have_innodb.inc
+# embedded server ignores 'delayed', so skip this
+-- source include/not_embedded.inc
+
+--disable_warnings
+drop table if exists t1;
+--enable_warnings
+
+#
+# 44030: Error: (1500) Couldn't read the MAX(ID) autoinc value from
+# the index (PRIMARY)
+# This test requires a restart of the server
+SET @@SESSION.AUTO_INCREMENT_INCREMENT=1, @@SESSION.AUTO_INCREMENT_OFFSET=1;
+CREATE TABLE t1 (c1 INT PRIMARY KEY AUTO_INCREMENT) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (null);
+INSERT INTO t1 VALUES (null);
+ALTER TABLE t1 CHANGE c1 d1 INT NOT NULL AUTO_INCREMENT;
+SELECT * FROM t1;
+# Restart the server
+-- source include/restart_mysqld.inc
+# The MySQL and InnoDB data dictionaries should now be out of sync.
+# The select should print message to the error log
+SELECT * FROM t1;
+# MySQL have made a change (http://lists.mysql.com/commits/75268) that no
+# longer results in the two data dictionaries being out of sync. If they
+# revert their changes then this check for ER_AUTOINC_READ_FAILED will need
+# to be enabled. Also, see http://bugs.mysql.com/bug.php?id=47621.
+-- error ER_AUTOINC_READ_FAILED,1467
+INSERT INTO t1 VALUES(null);
+ALTER TABLE t1 AUTO_INCREMENT = 3;
+SHOW CREATE TABLE t1;
+INSERT INTO t1 VALUES(null);
+SELECT * FROM t1;
+DROP TABLE t1;
diff --git a/storage/innobase/mysql-test/innodb-autoinc.result b/storage/innobase/mysql-test/innodb-autoinc.result
index d2e8eb19e0c..a36b3a1a865 100644
--- a/storage/innobase/mysql-test/innodb-autoinc.result
+++ b/storage/innobase/mysql-test/innodb-autoinc.result
@@ -867,25 +867,380 @@ INSERT INTO t2 SELECT NULL FROM t1;
Got one of the listed errors
DROP TABLE t1;
DROP TABLE t2;
-CREATE TABLE t1 (c1 INT PRIMARY KEY AUTO_INCREMENT) ENGINE=InnoDB;
-INSERT INTO t1 VALUES (null);
-INSERT INTO t1 VALUES (null);
-ALTER TABLE t1 CHANGE c1 d1 INT NOT NULL AUTO_INCREMENT;
+SET @@SESSION.AUTO_INCREMENT_INCREMENT=1, @@SESSION.AUTO_INCREMENT_OFFSET=1;
+SHOW VARIABLES LIKE "%auto_inc%";
+Variable_name Value
+auto_increment_increment 1
+auto_increment_offset 1
+CREATE TABLE t1 (c1 TINYINT PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+INSERT INTO t1 VALUES (-127, 'innodb');
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` tinyint(4) NOT NULL AUTO_INCREMENT,
+ `c2` varchar(10) DEFAULT NULL,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=3 DEFAULT CHARSET=latin1
+SELECT * FROM t1;
+c1 c2
+-127 innodb
+-1 innodb
+1 NULL
+2 NULL
+DROP TABLE t1;
+CREATE TABLE t1 (c1 TINYINT UNSIGNED PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+Warnings:
+Warning 1264 Out of range value for column 'c1' at row 1
+INSERT INTO t1 VALUES (-127, 'innodb');
+Warnings:
+Warning 1264 Out of range value for column 'c1' at row 1
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` tinyint(3) unsigned NOT NULL AUTO_INCREMENT,
+ `c2` varchar(10) DEFAULT NULL,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=latin1
+SELECT * FROM t1;
+c1 c2
+1 NULL
+2 innodb
+3 innodb
+4 NULL
+DROP TABLE t1;
+CREATE TABLE t1 (c1 SMALLINT PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+INSERT INTO t1 VALUES (-32767, 'innodb');
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` smallint(6) NOT NULL AUTO_INCREMENT,
+ `c2` varchar(10) DEFAULT NULL,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=3 DEFAULT CHARSET=latin1
+SELECT * FROM t1;
+c1 c2
+-32767 innodb
+-1 innodb
+1 NULL
+2 NULL
+DROP TABLE t1;
+CREATE TABLE t1 (c1 SMALLINT UNSIGNED PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+Warnings:
+Warning 1264 Out of range value for column 'c1' at row 1
+INSERT INTO t1 VALUES (-32757, 'innodb');
+Warnings:
+Warning 1264 Out of range value for column 'c1' at row 1
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` smallint(5) unsigned NOT NULL AUTO_INCREMENT,
+ `c2` varchar(10) DEFAULT NULL,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=latin1
+SELECT * FROM t1;
+c1 c2
+1 NULL
+2 innodb
+3 innodb
+4 NULL
+DROP TABLE t1;
+CREATE TABLE t1 (c1 MEDIUMINT PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+INSERT INTO t1 VALUES (-8388607, 'innodb');
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` mediumint(9) NOT NULL AUTO_INCREMENT,
+ `c2` varchar(10) DEFAULT NULL,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=3 DEFAULT CHARSET=latin1
+SELECT * FROM t1;
+c1 c2
+-8388607 innodb
+-1 innodb
+1 NULL
+2 NULL
+DROP TABLE t1;
+CREATE TABLE t1 (c1 MEDIUMINT UNSIGNED PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+Warnings:
+Warning 1264 Out of range value for column 'c1' at row 1
+INSERT INTO t1 VALUES (-8388607, 'innodb');
+Warnings:
+Warning 1264 Out of range value for column 'c1' at row 1
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` mediumint(8) unsigned NOT NULL AUTO_INCREMENT,
+ `c2` varchar(10) DEFAULT NULL,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=latin1
SELECT * FROM t1;
-d1
+c1 c2
+1 NULL
+2 innodb
+3 innodb
+4 NULL
+DROP TABLE t1;
+CREATE TABLE t1 (c1 INT PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+INSERT INTO t1 VALUES (-2147483647, 'innodb');
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` int(11) NOT NULL AUTO_INCREMENT,
+ `c2` varchar(10) DEFAULT NULL,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=3 DEFAULT CHARSET=latin1
+SELECT * FROM t1;
+c1 c2
+-2147483647 innodb
+-1 innodb
+1 NULL
+2 NULL
+DROP TABLE t1;
+CREATE TABLE t1 (c1 INT UNSIGNED PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+Warnings:
+Warning 1264 Out of range value for column 'c1' at row 1
+INSERT INTO t1 VALUES (-2147483647, 'innodb');
+Warnings:
+Warning 1264 Out of range value for column 'c1' at row 1
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` int(10) unsigned NOT NULL AUTO_INCREMENT,
+ `c2` varchar(10) DEFAULT NULL,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=latin1
+SELECT * FROM t1;
+c1 c2
+1 NULL
+2 innodb
+3 innodb
+4 NULL
+DROP TABLE t1;
+CREATE TABLE t1 (c1 BIGINT PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+INSERT INTO t1 VALUES (-9223372036854775807, 'innodb');
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` bigint(20) NOT NULL AUTO_INCREMENT,
+ `c2` varchar(10) DEFAULT NULL,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=3 DEFAULT CHARSET=latin1
+SELECT * FROM t1;
+c1 c2
+-9223372036854775807 innodb
+-1 innodb
+1 NULL
+2 NULL
+DROP TABLE t1;
+CREATE TABLE t1 (c1 BIGINT UNSIGNED PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+Warnings:
+Warning 1264 Out of range value for column 'c1' at row 1
+INSERT INTO t1 VALUES (-9223372036854775807, 'innodb');
+Warnings:
+Warning 1264 Out of range value for column 'c1' at row 1
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
+ `c2` varchar(10) DEFAULT NULL,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=latin1
+SELECT * FROM t1;
+c1 c2
+1 NULL
+2 innodb
+3 innodb
+4 NULL
+DROP TABLE t1;
+CREATE TABLE t1 (c1 INT AUTO_INCREMENT, c2 INT, PRIMARY KEY(c1)) AUTO_INCREMENT=10 ENGINE=InnoDB;
+CREATE INDEX i1 on t1(c2);
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` int(11) NOT NULL AUTO_INCREMENT,
+ `c2` int(11) DEFAULT NULL,
+ PRIMARY KEY (`c1`),
+ KEY `i1` (`c2`)
+) ENGINE=InnoDB AUTO_INCREMENT=10 DEFAULT CHARSET=latin1
+INSERT INTO t1 (c2) values (0);
+SELECT * FROM t1;
+c1 c2
+10 0
+DROP TABLE t1;
+DROP TABLE IF EXISTS t1;
+Warnings:
+Note 1051 Unknown table 't1'
+CREATE TABLE t1(C1 DOUBLE AUTO_INCREMENT KEY, C2 CHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1(C1, C2) VALUES (1, 'innodb'), (3, 'innodb');
+INSERT INTO t1(C2) VALUES ('innodb');
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `C1` double NOT NULL AUTO_INCREMENT,
+ `C2` char(10) DEFAULT NULL,
+ PRIMARY KEY (`C1`)
+) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=latin1
+DROP TABLE t1;
+CREATE TABLE t1(C1 FLOAT AUTO_INCREMENT KEY, C2 CHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1(C1, C2) VALUES (1, 'innodb'), (3, 'innodb');
+INSERT INTO t1(C2) VALUES ('innodb');
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `C1` float NOT NULL AUTO_INCREMENT,
+ `C2` char(10) DEFAULT NULL,
+ PRIMARY KEY (`C1`)
+) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=latin1
+DROP TABLE t1;
+DROP TABLE IF EXISTS t1;
+Warnings:
+Note 1051 Unknown table 't1'
+CREATE TABLE t1 (c1 INT AUTO_INCREMENT PRIMARY KEY) ENGINE=InnoDB;
+INSERT INTO t1 SET c1 = 1;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` int(11) NOT NULL AUTO_INCREMENT,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=latin1
+INSERT INTO t1 SET c1 = 2;
+INSERT INTO t1 SET c1 = -1;
+SELECT * FROM t1;
+c1
+-1
1
-3
+2
+INSERT INTO t1 SET c1 = -1;
+Got one of the listed errors
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` int(11) NOT NULL AUTO_INCREMENT,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=3 DEFAULT CHARSET=latin1
+REPLACE INTO t1 VALUES (-1);
+SELECT * FROM t1;
+c1
+-1
+1
+2
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` int(11) NOT NULL AUTO_INCREMENT,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=3 DEFAULT CHARSET=latin1
+DROP TABLE t1;
+DROP TABLE IF EXISTS t1;
+Warnings:
+Note 1051 Unknown table 't1'
+CREATE TABLE t1 (c1 INTEGER AUTO_INCREMENT, PRIMARY KEY (c1)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (-685113344), (1), (NULL), (NULL);
SELECT * FROM t1;
-d1
+c1
+-685113344
1
+2
3
-INSERT INTO t1 VALUES(null);
-Got one of the listed errors
-ALTER TABLE t1 AUTO_INCREMENT = 3;
-INSERT INTO t1 VALUES(null);
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` int(11) NOT NULL AUTO_INCREMENT,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=6 DEFAULT CHARSET=latin1
+DROP TABLE t1;
+CREATE TABLE t1 (c1 INTEGER AUTO_INCREMENT, PRIMARY KEY (c1)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (-685113344), (2), (NULL), (NULL);
+SELECT * FROM t1;
+c1
+-685113344
+2
+3
+4
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` int(11) NOT NULL AUTO_INCREMENT,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=7 DEFAULT CHARSET=latin1
+DROP TABLE t1;
+CREATE TABLE t1 (c1 INTEGER AUTO_INCREMENT, PRIMARY KEY (c1)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (NULL), (2), (-685113344), (NULL);
+INSERT INTO t1 VALUES (4), (5), (6), (NULL);
SELECT * FROM t1;
-d1
+c1
+-685113344
1
+2
3
4
+5
+6
+7
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` int(11) NOT NULL AUTO_INCREMENT,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=11 DEFAULT CHARSET=latin1
+DROP TABLE t1;
+CREATE TABLE t1 (c1 INTEGER AUTO_INCREMENT, PRIMARY KEY (c1)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (NULL), (2), (-685113344), (5);
+SELECT * FROM t1;
+c1
+-685113344
+1
+2
+5
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` int(11) NOT NULL AUTO_INCREMENT,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=6 DEFAULT CHARSET=latin1
+DROP TABLE t1;
+CREATE TABLE t1 (c1 INTEGER AUTO_INCREMENT, PRIMARY KEY (c1)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1), (2), (-685113344), (NULL);
+SELECT * FROM t1;
+c1
+-685113344
+1
+2
+3
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` int(11) NOT NULL AUTO_INCREMENT,
+ PRIMARY KEY (`c1`)
+) ENGINE=InnoDB AUTO_INCREMENT=7 DEFAULT CHARSET=latin1
DROP TABLE t1;
diff --git a/storage/innobase/mysql-test/innodb-autoinc.test b/storage/innobase/mysql-test/innodb-autoinc.test
index 61c42f45733..ef0359b78b0 100644
--- a/storage/innobase/mysql-test/innodb-autoinc.test
+++ b/storage/innobase/mysql-test/innodb-autoinc.test
@@ -478,23 +478,187 @@ INSERT INTO t2 SELECT c1 FROM t1;
INSERT INTO t2 SELECT NULL FROM t1;
DROP TABLE t1;
DROP TABLE t2;
+
+# If the user has specified negative values for an AUTOINC column then
+# InnoDB should ignore those values when setting the table's max value.
+SET @@SESSION.AUTO_INCREMENT_INCREMENT=1, @@SESSION.AUTO_INCREMENT_OFFSET=1;
+SHOW VARIABLES LIKE "%auto_inc%";
+# TINYINT
+CREATE TABLE t1 (c1 TINYINT PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+INSERT INTO t1 VALUES (-127, 'innodb');
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+SELECT * FROM t1;
+DROP TABLE t1;
+
+CREATE TABLE t1 (c1 TINYINT UNSIGNED PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+INSERT INTO t1 VALUES (-127, 'innodb');
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+SELECT * FROM t1;
+DROP TABLE t1;
+#
+# SMALLINT
+#
+CREATE TABLE t1 (c1 SMALLINT PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+INSERT INTO t1 VALUES (-32767, 'innodb');
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+SELECT * FROM t1;
+DROP TABLE t1;
+
+CREATE TABLE t1 (c1 SMALLINT UNSIGNED PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+INSERT INTO t1 VALUES (-32757, 'innodb');
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+SELECT * FROM t1;
+DROP TABLE t1;
+#
+# MEDIUMINT
+#
+CREATE TABLE t1 (c1 MEDIUMINT PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+INSERT INTO t1 VALUES (-8388607, 'innodb');
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+SELECT * FROM t1;
+DROP TABLE t1;
+
+CREATE TABLE t1 (c1 MEDIUMINT UNSIGNED PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+INSERT INTO t1 VALUES (-8388607, 'innodb');
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+SELECT * FROM t1;
+DROP TABLE t1;
+#
+# INT
+#
+CREATE TABLE t1 (c1 INT PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+INSERT INTO t1 VALUES (-2147483647, 'innodb');
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+SELECT * FROM t1;
+DROP TABLE t1;
+
+CREATE TABLE t1 (c1 INT UNSIGNED PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+INSERT INTO t1 VALUES (-2147483647, 'innodb');
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+SELECT * FROM t1;
+DROP TABLE t1;
+#
+# BIGINT
+#
+CREATE TABLE t1 (c1 BIGINT PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+INSERT INTO t1 VALUES (-9223372036854775807, 'innodb');
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+SELECT * FROM t1;
+DROP TABLE t1;
+
+CREATE TABLE t1 (c1 BIGINT UNSIGNED PRIMARY KEY AUTO_INCREMENT, c2 VARCHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, NULL);
+INSERT INTO t1 VALUES (-1, 'innodb');
+INSERT INTO t1 VALUES (-9223372036854775807, 'innodb');
+INSERT INTO t1 VALUES (NULL, NULL);
+SHOW CREATE TABLE t1;
+SELECT * FROM t1;
+DROP TABLE t1;
+#
+# End negative number check
+
+##
+# 47125: auto_increment start value is ignored if an index is created
+# and engine=innodb
#
-# 44030: Error: (1500) Couldn't read the MAX(ID) autoinc value from
-# the index (PRIMARY)
-# This test requires a restart of the server
-CREATE TABLE t1 (c1 INT PRIMARY KEY AUTO_INCREMENT) ENGINE=InnoDB;
-INSERT INTO t1 VALUES (null);
-INSERT INTO t1 VALUES (null);
-ALTER TABLE t1 CHANGE c1 d1 INT NOT NULL AUTO_INCREMENT;
+CREATE TABLE t1 (c1 INT AUTO_INCREMENT, c2 INT, PRIMARY KEY(c1)) AUTO_INCREMENT=10 ENGINE=InnoDB;
+CREATE INDEX i1 on t1(c2);
+SHOW CREATE TABLE t1;
+INSERT INTO t1 (c2) values (0);
SELECT * FROM t1;
+DROP TABLE t1;
+
+##
+# 49032: Use the correct function to read the AUTOINC column value
+#
+DROP TABLE IF EXISTS t1;
+CREATE TABLE t1(C1 DOUBLE AUTO_INCREMENT KEY, C2 CHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1(C1, C2) VALUES (1, 'innodb'), (3, 'innodb');
# Restart the server
-- source include/restart_mysqld.inc
-# The MySQL and InnoDB data dictionaries should now be out of sync.
-# The select should print message to the error log
+INSERT INTO t1(C2) VALUES ('innodb');
+SHOW CREATE TABLE t1;
+DROP TABLE t1;
+CREATE TABLE t1(C1 FLOAT AUTO_INCREMENT KEY, C2 CHAR(10)) ENGINE=InnoDB;
+INSERT INTO t1(C1, C2) VALUES (1, 'innodb'), (3, 'innodb');
+# Restart the server
+-- source include/restart_mysqld.inc
+INSERT INTO t1(C2) VALUES ('innodb');
+SHOW CREATE TABLE t1;
+DROP TABLE t1;
+
+##
+# 47720: REPLACE INTO Autoincrement column with negative values
+#
+DROP TABLE IF EXISTS t1;
+CREATE TABLE t1 (c1 INT AUTO_INCREMENT PRIMARY KEY) ENGINE=InnoDB;
+INSERT INTO t1 SET c1 = 1;
+SHOW CREATE TABLE t1;
+INSERT INTO t1 SET c1 = 2;
+INSERT INTO t1 SET c1 = -1;
SELECT * FROM t1;
--- error ER_AUTOINC_READ_FAILED,1467
-INSERT INTO t1 VALUES(null);
-ALTER TABLE t1 AUTO_INCREMENT = 3;
-INSERT INTO t1 VALUES(null);
+-- error ER_DUP_ENTRY,1062
+INSERT INTO t1 SET c1 = -1;
+SHOW CREATE TABLE t1;
+REPLACE INTO t1 VALUES (-1);
+SELECT * FROM t1;
+SHOW CREATE TABLE t1;
+DROP TABLE t1;
+
+##
+# 49497: Error 1467 (ER_AUTOINC_READ_FAILED) on inserting a negative value
+#
+DROP TABLE IF EXISTS t1;
+CREATE TABLE t1 (c1 INTEGER AUTO_INCREMENT, PRIMARY KEY (c1)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (-685113344), (1), (NULL), (NULL);
SELECT * FROM t1;
+SHOW CREATE TABLE t1;
+DROP TABLE t1;
+CREATE TABLE t1 (c1 INTEGER AUTO_INCREMENT, PRIMARY KEY (c1)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (-685113344), (2), (NULL), (NULL);
+SELECT * FROM t1;
+SHOW CREATE TABLE t1;
+DROP TABLE t1;
+CREATE TABLE t1 (c1 INTEGER AUTO_INCREMENT, PRIMARY KEY (c1)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (NULL), (2), (-685113344), (NULL);
+INSERT INTO t1 VALUES (4), (5), (6), (NULL);
+SELECT * FROM t1;
+SHOW CREATE TABLE t1;
+DROP TABLE t1;
+CREATE TABLE t1 (c1 INTEGER AUTO_INCREMENT, PRIMARY KEY (c1)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (NULL), (2), (-685113344), (5);
+SELECT * FROM t1;
+SHOW CREATE TABLE t1;
+DROP TABLE t1;
+CREATE TABLE t1 (c1 INTEGER AUTO_INCREMENT, PRIMARY KEY (c1)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1), (2), (-685113344), (NULL);
+SELECT * FROM t1;
+SHOW CREATE TABLE t1;
DROP TABLE t1;
diff --git a/storage/innobase/mysql-test/innodb-consistent-master.opt b/storage/innobase/mysql-test/innodb-consistent-master.opt
index 8cca44767da..cb48f1aaf60 100644
--- a/storage/innobase/mysql-test/innodb-consistent-master.opt
+++ b/storage/innobase/mysql-test/innodb-consistent-master.opt
@@ -1 +1 @@
---innodb_lock_wait_timeout=2
+--loose-innodb_lock_wait_timeout=2
diff --git a/storage/innobase/mysql-test/innodb-consistent.test b/storage/innobase/mysql-test/innodb-consistent.test
index 791600fc8a7..bf829a74ea2 100644
--- a/storage/innobase/mysql-test/innodb-consistent.test
+++ b/storage/innobase/mysql-test/innodb-consistent.test
@@ -1,58 +1,58 @@
--- source include/not_embedded.inc
--- source include/have_innodb.inc
-
---disable_warnings
-drop table if exists t1;
---enable_warnings
-
-# REPLACE INTO ... SELECT and INSERT INTO ... SELECT should do
-# a consistent read of the source table.
-
-connect (a,localhost,root,,);
-connect (b,localhost,root,,);
-connection a;
-set session transaction isolation level read committed;
-create table t1(a int not null) engine=innodb DEFAULT CHARSET=latin1;
-create table t2 like t1;
-insert into t2 values (1),(2),(3),(4),(5),(6),(7);
-set autocommit=0;
-
-# REPLACE INTO ... SELECT case
-begin;
-# this should not result in any locks on t2.
-replace into t1 select * from t2;
-
-connection b;
-set session transaction isolation level read committed;
-set autocommit=0;
-# should not cuase a lock wait.
-delete from t2 where a=5;
-commit;
-delete from t2;
-commit;
-connection a;
-commit;
-
-# INSERT INTO ... SELECT case
-begin;
-# this should not result in any locks on t2.
-insert into t1 select * from t2;
-
-connection b;
-set session transaction isolation level read committed;
-set autocommit=0;
-# should not cuase a lock wait.
-delete from t2 where a=5;
-commit;
-delete from t2;
-commit;
-connection a;
-commit;
-
-select * from t1;
-drop table t1;
-drop table t2;
-
-connection default;
-disconnect a;
-disconnect b;
+-- source include/not_embedded.inc
+-- source include/have_innodb.inc
+
+--disable_warnings
+drop table if exists t1;
+--enable_warnings
+
+# REPLACE INTO ... SELECT and INSERT INTO ... SELECT should do
+# a consistent read of the source table.
+
+connect (a,localhost,root,,);
+connect (b,localhost,root,,);
+connection a;
+set session transaction isolation level read committed;
+create table t1(a int not null) engine=innodb DEFAULT CHARSET=latin1;
+create table t2 like t1;
+insert into t2 values (1),(2),(3),(4),(5),(6),(7);
+set autocommit=0;
+
+# REPLACE INTO ... SELECT case
+begin;
+# this should not result in any locks on t2.
+replace into t1 select * from t2;
+
+connection b;
+set session transaction isolation level read committed;
+set autocommit=0;
+# should not cause a lock wait.
+delete from t2 where a=5;
+commit;
+delete from t2;
+commit;
+connection a;
+commit;
+
+# INSERT INTO ... SELECT case
+begin;
+# this should not result in any locks on t2.
+insert into t1 select * from t2;
+
+connection b;
+set session transaction isolation level read committed;
+set autocommit=0;
+# should not cause a lock wait.
+delete from t2 where a=5;
+commit;
+delete from t2;
+commit;
+connection a;
+commit;
+
+select * from t1;
+drop table t1;
+drop table t2;
+
+connection default;
+disconnect a;
+disconnect b;
diff --git a/storage/innobase/mysql-test/innodb-index.result b/storage/innobase/mysql-test/innodb-index.result
index a7d66b15300..f384b825a2c 100644
--- a/storage/innobase/mysql-test/innodb-index.result
+++ b/storage/innobase/mysql-test/innodb-index.result
@@ -46,13 +46,6 @@ t1 CREATE TABLE `t1` (
KEY `d2` (`d`),
KEY `b` (`b`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1
-CREATE TABLE `t1#1`(a INT PRIMARY KEY) ENGINE=InnoDB;
-alter table t1 add unique index (c), add index (d);
-ERROR HY000: Table 'test.t1#1' already exists
-rename table `t1#1` to `t1#2`;
-alter table t1 add unique index (c), add index (d);
-ERROR HY000: Table 'test.t1#2' already exists
-drop table `t1#2`;
alter table t1 add unique index (c), add index (d);
show create table t1;
Table Create Table
@@ -441,6 +434,7 @@ t3 CREATE TABLE `t3` (
KEY `c` (`c`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1
alter table t2 drop index b, add index (b);
+ERROR 42000: Incorrect index name 'b'
show create table t2;
Table Create Table
t2 CREATE TABLE `t2` (
@@ -451,8 +445,8 @@ t2 CREATE TABLE `t2` (
`e` int(11) DEFAULT NULL,
PRIMARY KEY (`a`),
UNIQUE KEY `dc` (`d`,`c`),
- KEY `c` (`c`),
KEY `b` (`b`),
+ KEY `c` (`c`),
CONSTRAINT `t2_ibfk_1` FOREIGN KEY (`b`) REFERENCES `t1` (`b`) ON DELETE CASCADE,
CONSTRAINT `t2_ibfk_2` FOREIGN KEY (`c`) REFERENCES `t3` (`c`),
CONSTRAINT `t2_ibfk_3` FOREIGN KEY (`d`) REFERENCES `t4` (`d`)
@@ -968,6 +962,7 @@ create index t1u on t1 (u(1));
drop table t1;
set global innodb_file_per_table=0;
set global innodb_file_format=Antelope;
+set global innodb_file_format_check=Antelope;
SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;
SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;
CREATE TABLE t1(
diff --git a/storage/innobase/mysql-test/innodb-index.test b/storage/innobase/mysql-test/innodb-index.test
index 42888ff3686..da1bc543ae9 100644
--- a/storage/innobase/mysql-test/innodb-index.test
+++ b/storage/innobase/mysql-test/innodb-index.test
@@ -1,5 +1,9 @@
-- source include/have_innodb.inc
+let $MYSQLD_DATADIR= `select @@datadir`;
+
+let $innodb_file_format_check_orig=`select @@innodb_file_format_check`;
+
create table t1(a int not null, b int, c char(10) not null, d varchar(20)) engine = innodb;
insert into t1 values (5,5,'oo','oo'),(4,4,'tr','tr'),(3,4,'ad','ad'),(2,3,'ak','ak');
commit;
@@ -17,16 +21,6 @@ show create table t1;
alter table t1 add index (b);
show create table t1;
-# Check how existing tables interfere with temporary tables.
-CREATE TABLE `t1#1`(a INT PRIMARY KEY) ENGINE=InnoDB;
-
---error 156
-alter table t1 add unique index (c), add index (d);
-rename table `t1#1` to `t1#2`;
---error 156
-alter table t1 add unique index (c), add index (d);
-drop table `t1#2`;
-
alter table t1 add unique index (c), add index (d);
show create table t1;
explain select * from t1 force index(c) order by c;
@@ -137,6 +131,8 @@ show create table t4;
--error ER_CANT_CREATE_TABLE
alter table t3 add constraint dc foreign key (a) references t1(a);
show create table t3;
+# this should be fixed by MySQL (see Bug #51451)
+--error ER_WRONG_NAME_FOR_INDEX
alter table t2 drop index b, add index (b);
show create table t2;
--error ER_ROW_IS_REFERENCED_2
@@ -144,7 +140,9 @@ delete from t1;
--error ER_CANT_DROP_FIELD_OR_KEY
drop index dc on t4;
# there is no foreign key dc on t3
---replace_regex /'\.\/test\/#sql2-[0-9a-f-]*'/'#sql2-temporary'/
+--replace_regex /'[^']*test\/#sql2-[0-9a-f-]*'/'#sql2-temporary'/
+# Embedded server doesn't chdir to data directory
+--replace_result $MYSQLD_DATADIR ./ master-data/ ''
--error ER_ERROR_ON_RENAME
alter table t3 drop foreign key dc;
alter table t4 drop foreign key dc;
@@ -398,6 +396,7 @@ create index t1u on t1 (u(1));
drop table t1;
eval set global innodb_file_per_table=$per_table;
eval set global innodb_file_format=$format;
+eval set global innodb_file_format_check=$format;
#
# Test to check whether CREATE INDEX handles implicit foreign key
@@ -532,3 +531,10 @@ disconnect a;
disconnect b;
DROP TABLE t1;
+
+#
+# restore environment to the state it was before this test execution
+#
+
+-- disable_query_log
+eval SET GLOBAL innodb_file_format_check=$innodb_file_format_check_orig;
diff --git a/storage/innobase/mysql-test/innodb-master.opt b/storage/innobase/mysql-test/innodb-master.opt
index 4901efb416c..72c88068345 100644
--- a/storage/innobase/mysql-test/innodb-master.opt
+++ b/storage/innobase/mysql-test/innodb-master.opt
@@ -1 +1 @@
---binlog_cache_size=32768 --innodb_lock_wait_timeout=1
+--binlog_cache_size=32768 --loose_innodb_lock_wait_timeout=1
diff --git a/storage/innobase/mysql-test/innodb-semi-consistent-master.opt b/storage/innobase/mysql-test/innodb-semi-consistent-master.opt
index e76299453d3..cb48f1aaf60 100644
--- a/storage/innobase/mysql-test/innodb-semi-consistent-master.opt
+++ b/storage/innobase/mysql-test/innodb-semi-consistent-master.opt
@@ -1 +1 @@
---innodb_lock_wait_timeout=2
+--loose-innodb_lock_wait_timeout=2
diff --git a/storage/innobase/mysql-test/innodb-use-sys-malloc-master.opt b/storage/innobase/mysql-test/innodb-use-sys-malloc-master.opt
index 889834add01..fc8582b5887 100644
--- a/storage/innobase/mysql-test/innodb-use-sys-malloc-master.opt
+++ b/storage/innobase/mysql-test/innodb-use-sys-malloc-master.opt
@@ -1,2 +1 @@
---innodb-use-sys-malloc=true
---innodb-use-sys-malloc=true
+--loose-innodb-use-sys-malloc=true
diff --git a/storage/innobase/mysql-test/innodb-zip.result b/storage/innobase/mysql-test/innodb-zip.result
index b26c4112826..21396d81ba8 100644
--- a/storage/innobase/mysql-test/innodb-zip.result
+++ b/storage/innobase/mysql-test/innodb-zip.result
@@ -196,15 +196,15 @@ drop table t1;
set innodb_strict_mode = on;
create table t1 (id int primary key) engine = innodb key_block_size = 0;
ERROR HY000: Can't create table 'test.t1' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: invalid KEY_BLOCK_SIZE = 0. Valid values are [1, 2, 4, 8, 16]
+Warning 1478 InnoDB: invalid KEY_BLOCK_SIZE = 0. Valid values are [1, 2, 4, 8, 16]
Error 1005 Can't create table 'test.t1' (errno: 1478)
create table t2 (id int primary key) engine = innodb key_block_size = 9;
ERROR HY000: Can't create table 'test.t2' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: invalid KEY_BLOCK_SIZE = 9. Valid values are [1, 2, 4, 8, 16]
+Warning 1478 InnoDB: invalid KEY_BLOCK_SIZE = 9. Valid values are [1, 2, 4, 8, 16]
Error 1005 Can't create table 'test.t2' (errno: 1478)
create table t3 (id int primary key) engine = innodb key_block_size = 1;
create table t4 (id int primary key) engine = innodb key_block_size = 2;
@@ -233,30 +233,30 @@ key_block_size = 8 row_format = compressed;
create table t2 (id int primary key) engine = innodb
key_block_size = 8 row_format = redundant;
ERROR HY000: Can't create table 'test.t2' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: cannot specify ROW_FORMAT = REDUNDANT with KEY_BLOCK_SIZE.
+Warning 1478 InnoDB: cannot specify ROW_FORMAT = REDUNDANT with KEY_BLOCK_SIZE.
Error 1005 Can't create table 'test.t2' (errno: 1478)
create table t3 (id int primary key) engine = innodb
key_block_size = 8 row_format = compact;
ERROR HY000: Can't create table 'test.t3' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: cannot specify ROW_FORMAT = COMPACT with KEY_BLOCK_SIZE.
+Warning 1478 InnoDB: cannot specify ROW_FORMAT = COMPACT with KEY_BLOCK_SIZE.
Error 1005 Can't create table 'test.t3' (errno: 1478)
create table t4 (id int primary key) engine = innodb
key_block_size = 8 row_format = dynamic;
ERROR HY000: Can't create table 'test.t4' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: cannot specify ROW_FORMAT = DYNAMIC with KEY_BLOCK_SIZE.
+Warning 1478 InnoDB: cannot specify ROW_FORMAT = DYNAMIC with KEY_BLOCK_SIZE.
Error 1005 Can't create table 'test.t4' (errno: 1478)
create table t5 (id int primary key) engine = innodb
key_block_size = 8 row_format = default;
ERROR HY000: Can't create table 'test.t5' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: cannot specify ROW_FORMAT = COMPACT with KEY_BLOCK_SIZE.
+Warning 1478 InnoDB: cannot specify ROW_FORMAT = COMPACT with KEY_BLOCK_SIZE.
Error 1005 Can't create table 'test.t5' (errno: 1478)
SELECT table_schema, table_name, row_format
FROM information_schema.tables WHERE engine='innodb';
@@ -266,26 +266,26 @@ drop table t1;
create table t1 (id int primary key) engine = innodb
key_block_size = 9 row_format = redundant;
ERROR HY000: Can't create table 'test.t1' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: invalid KEY_BLOCK_SIZE = 9. Valid values are [1, 2, 4, 8, 16]
-Error 1478 InnoDB: cannot specify ROW_FORMAT = REDUNDANT with KEY_BLOCK_SIZE.
+Warning 1478 InnoDB: invalid KEY_BLOCK_SIZE = 9. Valid values are [1, 2, 4, 8, 16]
+Warning 1478 InnoDB: cannot specify ROW_FORMAT = REDUNDANT with KEY_BLOCK_SIZE.
Error 1005 Can't create table 'test.t1' (errno: 1478)
create table t2 (id int primary key) engine = innodb
key_block_size = 9 row_format = compact;
ERROR HY000: Can't create table 'test.t2' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: invalid KEY_BLOCK_SIZE = 9. Valid values are [1, 2, 4, 8, 16]
-Error 1478 InnoDB: cannot specify ROW_FORMAT = COMPACT with KEY_BLOCK_SIZE.
+Warning 1478 InnoDB: invalid KEY_BLOCK_SIZE = 9. Valid values are [1, 2, 4, 8, 16]
+Warning 1478 InnoDB: cannot specify ROW_FORMAT = COMPACT with KEY_BLOCK_SIZE.
Error 1005 Can't create table 'test.t2' (errno: 1478)
create table t2 (id int primary key) engine = innodb
key_block_size = 9 row_format = dynamic;
ERROR HY000: Can't create table 'test.t2' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: invalid KEY_BLOCK_SIZE = 9. Valid values are [1, 2, 4, 8, 16]
-Error 1478 InnoDB: cannot specify ROW_FORMAT = DYNAMIC with KEY_BLOCK_SIZE.
+Warning 1478 InnoDB: invalid KEY_BLOCK_SIZE = 9. Valid values are [1, 2, 4, 8, 16]
+Warning 1478 InnoDB: cannot specify ROW_FORMAT = DYNAMIC with KEY_BLOCK_SIZE.
Error 1005 Can't create table 'test.t2' (errno: 1478)
SELECT table_schema, table_name, row_format
FROM information_schema.tables WHERE engine='innodb';
@@ -293,45 +293,45 @@ table_schema table_name row_format
set global innodb_file_per_table = off;
create table t1 (id int primary key) engine = innodb key_block_size = 1;
ERROR HY000: Can't create table 'test.t1' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_per_table.
+Warning 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_per_table.
Error 1005 Can't create table 'test.t1' (errno: 1478)
create table t2 (id int primary key) engine = innodb key_block_size = 2;
ERROR HY000: Can't create table 'test.t2' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_per_table.
+Warning 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_per_table.
Error 1005 Can't create table 'test.t2' (errno: 1478)
create table t3 (id int primary key) engine = innodb key_block_size = 4;
ERROR HY000: Can't create table 'test.t3' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_per_table.
+Warning 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_per_table.
Error 1005 Can't create table 'test.t3' (errno: 1478)
create table t4 (id int primary key) engine = innodb key_block_size = 8;
ERROR HY000: Can't create table 'test.t4' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_per_table.
+Warning 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_per_table.
Error 1005 Can't create table 'test.t4' (errno: 1478)
create table t5 (id int primary key) engine = innodb key_block_size = 16;
ERROR HY000: Can't create table 'test.t5' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_per_table.
+Warning 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_per_table.
Error 1005 Can't create table 'test.t5' (errno: 1478)
create table t6 (id int primary key) engine = innodb row_format = compressed;
ERROR HY000: Can't create table 'test.t6' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: ROW_FORMAT=COMPRESSED requires innodb_file_per_table.
+Warning 1478 InnoDB: ROW_FORMAT=COMPRESSED requires innodb_file_per_table.
Error 1005 Can't create table 'test.t6' (errno: 1478)
create table t7 (id int primary key) engine = innodb row_format = dynamic;
ERROR HY000: Can't create table 'test.t7' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: ROW_FORMAT=DYNAMIC requires innodb_file_per_table.
+Warning 1478 InnoDB: ROW_FORMAT=DYNAMIC requires innodb_file_per_table.
Error 1005 Can't create table 'test.t7' (errno: 1478)
create table t8 (id int primary key) engine = innodb row_format = compact;
create table t9 (id int primary key) engine = innodb row_format = redundant;
@@ -345,45 +345,45 @@ set global innodb_file_per_table = on;
set global innodb_file_format = `0`;
create table t1 (id int primary key) engine = innodb key_block_size = 1;
ERROR HY000: Can't create table 'test.t1' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_format > Antelope.
+Warning 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_format > Antelope.
Error 1005 Can't create table 'test.t1' (errno: 1478)
create table t2 (id int primary key) engine = innodb key_block_size = 2;
ERROR HY000: Can't create table 'test.t2' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_format > Antelope.
+Warning 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_format > Antelope.
Error 1005 Can't create table 'test.t2' (errno: 1478)
create table t3 (id int primary key) engine = innodb key_block_size = 4;
ERROR HY000: Can't create table 'test.t3' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_format > Antelope.
+Warning 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_format > Antelope.
Error 1005 Can't create table 'test.t3' (errno: 1478)
create table t4 (id int primary key) engine = innodb key_block_size = 8;
ERROR HY000: Can't create table 'test.t4' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_format > Antelope.
+Warning 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_format > Antelope.
Error 1005 Can't create table 'test.t4' (errno: 1478)
create table t5 (id int primary key) engine = innodb key_block_size = 16;
ERROR HY000: Can't create table 'test.t5' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_format > Antelope.
+Warning 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_format > Antelope.
Error 1005 Can't create table 'test.t5' (errno: 1478)
create table t6 (id int primary key) engine = innodb row_format = compressed;
ERROR HY000: Can't create table 'test.t6' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: ROW_FORMAT=COMPRESSED requires innodb_file_format > Antelope.
+Warning 1478 InnoDB: ROW_FORMAT=COMPRESSED requires innodb_file_format > Antelope.
Error 1005 Can't create table 'test.t6' (errno: 1478)
create table t7 (id int primary key) engine = innodb row_format = dynamic;
ERROR HY000: Can't create table 'test.t7' (errno: 1478)
-show errors;
+show warnings;
Level Code Message
-Error 1478 InnoDB: ROW_FORMAT=DYNAMIC requires innodb_file_format > Antelope.
+Warning 1478 InnoDB: ROW_FORMAT=DYNAMIC requires innodb_file_format > Antelope.
Error 1005 Can't create table 'test.t7' (errno: 1478)
create table t8 (id int primary key) engine = innodb row_format = compact;
create table t9 (id int primary key) engine = innodb row_format = redundant;
diff --git a/storage/innobase/mysql-test/innodb-zip.test b/storage/innobase/mysql-test/innodb-zip.test
index 5bcd0e3c824..fdb9b89e37a 100644
--- a/storage/innobase/mysql-test/innodb-zip.test
+++ b/storage/innobase/mysql-test/innodb-zip.test
@@ -174,11 +174,11 @@ set innodb_strict_mode = on;
--error ER_CANT_CREATE_TABLE
create table t1 (id int primary key) engine = innodb key_block_size = 0;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t2 (id int primary key) engine = innodb key_block_size = 9;
-show errors;
+show warnings;
create table t3 (id int primary key) engine = innodb key_block_size = 1;
@@ -204,22 +204,22 @@ key_block_size = 8 row_format = compressed;
--error ER_CANT_CREATE_TABLE
create table t2 (id int primary key) engine = innodb
key_block_size = 8 row_format = redundant;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t3 (id int primary key) engine = innodb
key_block_size = 8 row_format = compact;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t4 (id int primary key) engine = innodb
key_block_size = 8 row_format = dynamic;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t5 (id int primary key) engine = innodb
key_block_size = 8 row_format = default;
-show errors;
+show warnings;
SELECT table_schema, table_name, row_format
FROM information_schema.tables WHERE engine='innodb';
@@ -229,17 +229,17 @@ drop table t1;
--error ER_CANT_CREATE_TABLE
create table t1 (id int primary key) engine = innodb
key_block_size = 9 row_format = redundant;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t2 (id int primary key) engine = innodb
key_block_size = 9 row_format = compact;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t2 (id int primary key) engine = innodb
key_block_size = 9 row_format = dynamic;
-show errors;
+show warnings;
SELECT table_schema, table_name, row_format
FROM information_schema.tables WHERE engine='innodb';
@@ -249,25 +249,25 @@ set global innodb_file_per_table = off;
--error ER_CANT_CREATE_TABLE
create table t1 (id int primary key) engine = innodb key_block_size = 1;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t2 (id int primary key) engine = innodb key_block_size = 2;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t3 (id int primary key) engine = innodb key_block_size = 4;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t4 (id int primary key) engine = innodb key_block_size = 8;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t5 (id int primary key) engine = innodb key_block_size = 16;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t6 (id int primary key) engine = innodb row_format = compressed;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t7 (id int primary key) engine = innodb row_format = dynamic;
-show errors;
+show warnings;
create table t8 (id int primary key) engine = innodb row_format = compact;
create table t9 (id int primary key) engine = innodb row_format = redundant;
@@ -281,25 +281,25 @@ set global innodb_file_format = `0`;
--error ER_CANT_CREATE_TABLE
create table t1 (id int primary key) engine = innodb key_block_size = 1;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t2 (id int primary key) engine = innodb key_block_size = 2;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t3 (id int primary key) engine = innodb key_block_size = 4;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t4 (id int primary key) engine = innodb key_block_size = 8;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t5 (id int primary key) engine = innodb key_block_size = 16;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t6 (id int primary key) engine = innodb row_format = compressed;
-show errors;
+show warnings;
--error ER_CANT_CREATE_TABLE
create table t7 (id int primary key) engine = innodb row_format = dynamic;
-show errors;
+show warnings;
create table t8 (id int primary key) engine = innodb row_format = compact;
create table t9 (id int primary key) engine = innodb row_format = redundant;
diff --git a/storage/innobase/mysql-test/innodb.result b/storage/innobase/mysql-test/innodb.result
index bdae7633fd1..d7f4731436b 100644
--- a/storage/innobase/mysql-test/innodb.result
+++ b/storage/innobase/mysql-test/innodb.result
@@ -692,6 +692,9 @@ select count(*) from t1 where sca_pic is null;
count(*)
2
alter table t1 drop index sca_pic, add index sca_pic (cat_code, sca_pic);
+ERROR 42000: Incorrect index name 'sca_pic'
+alter table t1 drop index sca_pic;
+alter table t1 add index sca_pic (cat_code, sca_pic);
select count(*) from t1 where sca_code='PD' and sca_pic is null;
count(*)
1
@@ -699,6 +702,9 @@ select count(*) from t1 where cat_code='E';
count(*)
0
alter table t1 drop index sca_pic, add index (sca_pic, cat_code);
+ERROR 42000: Incorrect index name 'sca_pic'
+alter table t1 drop index sca_pic;
+alter table t1 add index (sca_pic, cat_code);
select count(*) from t1 where sca_code='PD' and sca_pic is null;
count(*)
1
@@ -1833,6 +1839,7 @@ show variables like "innodb_thread_sleep_delay";
Variable_name Value
innodb_thread_sleep_delay 10000
set storage_engine=INNODB;
+set session old_alter_table=1;
drop table if exists t1,t2,t3;
--- Testing varchar ---
--- Testing varchar ---
@@ -1970,7 +1977,7 @@ explain select count(*) from t1 where v between 'a' and 'a ' and v between 'a '
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ref v v 13 const # Using where; Using index
alter table t1 add unique(v);
-ERROR 23000: Duplicate entry 'v' for key 'v_2'
+ERROR 23000: Duplicate entry '{ ' for key 'v_2'
alter table t1 add key(v);
select concat('*',v,'*',c,'*',t,'*') as qq from t1 where v='a';
qq
@@ -2406,6 +2413,7 @@ select * from t1 where a=20 and b is null;
a b
20 NULL
drop table t1;
+set session old_alter_table=0;
create table t1 (v varchar(65530), key(v));
Warnings:
Warning 1071 Specified key was too long; max key length is 767 bytes
@@ -3088,7 +3096,7 @@ ERROR HY000: Lock wait timeout exceeded; try restarting transaction
commit;
drop table t1, t2, t3, t5, t6, t8, t9;
CREATE TABLE t1 (DB_ROW_ID int) engine=innodb;
-ERROR HY000: Can't create table 'test.t1' (errno: -1)
+ERROR 42000: Incorrect column name 'DB_ROW_ID'
CREATE TABLE t1 (
a BIGINT(20) NOT NULL,
PRIMARY KEY (a)
diff --git a/storage/innobase/mysql-test/innodb.test b/storage/innobase/mysql-test/innodb.test
index f46a3a70b56..9f9766acd82 100644
--- a/storage/innobase/mysql-test/innodb.test
+++ b/storage/innobase/mysql-test/innodb.test
@@ -15,6 +15,8 @@
-- source include/have_innodb.inc
+let $MYSQLD_DATADIR= `select @@datadir`;
+
# Save the original values of some variables in order to be able to
# estimate how much they have changed during the tests. Previously this
# test assumed that e.g. rows_deleted is 0 here and after deleting 23
@@ -425,11 +427,19 @@ INSERT INTO t1 ( sca_code, cat_code, sca_desc, lan_code, sca_pic, sca_sdesc, sca
select count(*) from t1 where sca_code = 'PD';
select count(*) from t1 where sca_code <= 'PD';
select count(*) from t1 where sca_pic is null;
+# this should be fixed by MySQL (see Bug #51451)
+--error ER_WRONG_NAME_FOR_INDEX
alter table t1 drop index sca_pic, add index sca_pic (cat_code, sca_pic);
+alter table t1 drop index sca_pic;
+alter table t1 add index sca_pic (cat_code, sca_pic);
select count(*) from t1 where sca_code='PD' and sca_pic is null;
select count(*) from t1 where cat_code='E';
+# this should be fixed by MySQL (see Bug #51451)
+--error ER_WRONG_NAME_FOR_INDEX
alter table t1 drop index sca_pic, add index (sca_pic, cat_code);
+alter table t1 drop index sca_pic;
+alter table t1 add index (sca_pic, cat_code);
select count(*) from t1 where sca_code='PD' and sca_pic is null;
select count(*) from t1 where sca_pic >= 'n';
select sca_pic from t1 where sca_pic is null;
@@ -1375,7 +1385,10 @@ show variables like "innodb_thread_sleep_delay";
let $default=`select @@storage_engine`;
set storage_engine=INNODB;
+# this should be fixed by MySQL (see Bug #51451)
+set session old_alter_table=1;
source include/varchar.inc;
+set session old_alter_table=0;
#
# Some errors/warnings on create
@@ -1700,7 +1713,7 @@ set foreign_key_checks=0;
create table t2 (a varchar(10), foreign key (a) references t1(a)) engine = innodb DEFAULT CHARSET=latin1;
create table t3(a varchar(10) primary key) engine = innodb DEFAULT CHARSET=utf8;
# Embedded server doesn't chdir to data directory
---replace_result $MYSQLTEST_VARDIR . master-data/ ''
+--replace_result $MYSQLD_DATADIR ./ master-data/ ''
-- error 1025
rename table t3 to t1;
set foreign_key_checks=1;
@@ -2264,7 +2277,7 @@ disconnect j;
drop table t1, t2, t3, t5, t6, t8, t9;
# bug 18934, "InnoDB crashes when table uses column names like DB_ROW_ID"
---error 1005
+--error ER_WRONG_COLUMN_NAME
CREATE TABLE t1 (DB_ROW_ID int) engine=innodb;
#
@@ -2340,7 +2353,7 @@ ALTER TABLE t2 ADD FOREIGN KEY (a) REFERENCES t1 (a) ON DELETE SET NULL;
# mysqltest first does replace_regex, then replace_result
--replace_regex /'[^']*test\/#sql-[0-9a-f_]*'/'#sql-temporary'/
# Embedded server doesn't chdir to data directory
---replace_result $MYSQLTEST_VARDIR . master-data/ ''
+--replace_result $MYSQLD_DATADIR ./ master-data/ ''
--error 1025
ALTER TABLE t2 MODIFY a INT NOT NULL;
DELETE FROM t1;
diff --git a/storage/innobase/mysql-test/innodb_bug21704.result b/storage/innobase/mysql-test/innodb_bug21704.result
index b8e0b15d50d..ffbfa8a337e 100644
--- a/storage/innobase/mysql-test/innodb_bug21704.result
+++ b/storage/innobase/mysql-test/innodb_bug21704.result
@@ -25,8 +25,8 @@ ALTER TABLE t1 CHANGE a c INT;
ERROR HY000: Error on rename of '#sql-temporary' to './test/t1' (errno: 150)
# Ensure that online column rename works.
ALTER TABLE t1 CHANGE b c INT;
-affected rows: 0
-info: Records: 0 Duplicates: 0 Warnings: 0
+affected rows: 3
+info: Records: 3 Duplicates: 0 Warnings: 0
# Test renaming the column in the referencing table
@@ -34,8 +34,8 @@ ALTER TABLE t2 CHANGE a c INT;
ERROR HY000: Error on rename of '#sql-temporary' to './test/t2' (errno: 150)
# Ensure that online column rename works.
ALTER TABLE t2 CHANGE b c INT;
-affected rows: 0
-info: Records: 0 Duplicates: 0 Warnings: 0
+affected rows: 3
+info: Records: 3 Duplicates: 0 Warnings: 0
# Test with self-referential constraints
@@ -45,8 +45,8 @@ ALTER TABLE t3 CHANGE b d INT;
ERROR HY000: Error on rename of '#sql-temporary' to './test/t3' (errno: 150)
# Ensure that online column rename works.
ALTER TABLE t3 CHANGE c d INT;
-affected rows: 0
-info: Records: 0 Duplicates: 0 Warnings: 0
+affected rows: 3
+info: Records: 3 Duplicates: 0 Warnings: 0
# Cleanup.
diff --git a/storage/innobase/mysql-test/innodb_bug38231.result b/storage/innobase/mysql-test/innodb_bug38231.result
new file mode 100644
index 00000000000..2f909779755
--- /dev/null
+++ b/storage/innobase/mysql-test/innodb_bug38231.result
@@ -0,0 +1,11 @@
+SET storage_engine=InnoDB;
+INSERT INTO bug38231 VALUES (1), (10), (300);
+SET autocommit=0;
+SELECT * FROM bug38231 FOR UPDATE;
+a
+1
+10
+300
+TRUNCATE TABLE bug38231;
+COMMIT;
+DROP TABLE bug38231;
diff --git a/storage/innobase/mysql-test/innodb_bug38231.test b/storage/innobase/mysql-test/innodb_bug38231.test
new file mode 100644
index 00000000000..1611cb56203
--- /dev/null
+++ b/storage/innobase/mysql-test/innodb_bug38231.test
@@ -0,0 +1,112 @@
+#
+# Bug#38231 Innodb crash in lock_reset_all_on_table() on TRUNCATE + LOCK / UNLOCK
+# http://bugs.mysql.com/38231
+#
+
+-- source include/have_innodb.inc
+
+SET storage_engine=InnoDB;
+
+# we care only that the following SQL commands do not crash the server
+-- disable_query_log
+-- disable_result_log
+
+DROP TABLE IF EXISTS bug38231;
+CREATE TABLE bug38231 (a INT);
+
+-- connect (con1,localhost,root,,)
+-- connect (con2,localhost,root,,)
+-- connect (con3,localhost,root,,)
+
+-- connection con1
+SET autocommit=0;
+LOCK TABLE bug38231 WRITE;
+
+-- connection con2
+SET autocommit=0;
+-- send
+LOCK TABLE bug38231 WRITE;
+
+# When con1 does UNLOCK below this will release either con2 or con3 which are
+# both waiting on LOCK. At the end we must first --reap and UNLOCK the
+# connection that has been released, otherwise it will wait forever. We assume
+# that the released connection will be the first one that has gained the LOCK,
+# thus we force the order here - con2 does LOCK first, then con3. In other
+# words we wait for LOCK from con2 above to be exected before doing LOCK in
+# con3.
+-- connection con1
+let $wait_condition =
+ SELECT COUNT(*) = 1 FROM information_schema.processlist
+ WHERE info = 'LOCK TABLE bug38231 WRITE';
+-- source include/wait_condition.inc
+# the above enables query log, re-disable it
+-- disable_query_log
+
+-- connection con3
+SET autocommit=0;
+-- send
+LOCK TABLE bug38231 WRITE;
+
+-- connection default
+-- send
+TRUNCATE TABLE bug38231;
+
+-- connection con1
+# Wait for TRUNCATE and the other two LOCKs to be executed; without this,
+# sometimes UNLOCK executes before them. We assume there are no other
+# sessions executing at the same time with the same SQL commands.
+let $wait_condition =
+ SELECT COUNT(*) = 1 FROM information_schema.processlist
+ WHERE info = 'TRUNCATE TABLE bug38231';
+-- source include/wait_condition.inc
+let $wait_condition =
+ SELECT COUNT(*) = 2 FROM information_schema.processlist
+ WHERE info = 'LOCK TABLE bug38231 WRITE';
+-- source include/wait_condition.inc
+# the above enables query log, re-disable it
+-- disable_query_log
+
+# this crashes the server if the bug is present
+UNLOCK TABLES;
+
+# clean up
+
+-- connection con2
+-- reap
+UNLOCK TABLES;
+
+-- connection con3
+-- reap
+UNLOCK TABLES;
+
+-- connection default
+-- reap
+
+-- disconnect con1
+-- disconnect con2
+-- disconnect con3
+
+# test that TRUNCATE works with with row-level locks
+
+-- enable_query_log
+-- enable_result_log
+
+INSERT INTO bug38231 VALUES (1), (10), (300);
+
+-- connect (con4,localhost,root,,)
+
+-- connection con4
+SET autocommit=0;
+SELECT * FROM bug38231 FOR UPDATE;
+
+-- connection default
+TRUNCATE TABLE bug38231;
+
+-- connection con4
+COMMIT;
+
+-- connection default
+
+-- disconnect con4
+
+DROP TABLE bug38231;
diff --git a/storage/innobase/mysql-test/innodb_bug39438-master.opt b/storage/innobase/mysql-test/innodb_bug39438-master.opt
new file mode 100644
index 00000000000..43fac202fd4
--- /dev/null
+++ b/storage/innobase/mysql-test/innodb_bug39438-master.opt
@@ -0,0 +1 @@
+--innodb-file-per-table=1
diff --git a/storage/innobase/mysql-test/innodb_bug39438.result b/storage/innobase/mysql-test/innodb_bug39438.result
new file mode 100644
index 00000000000..195775f74c8
--- /dev/null
+++ b/storage/innobase/mysql-test/innodb_bug39438.result
@@ -0,0 +1 @@
+SET storage_engine=InnoDB;
diff --git a/storage/innobase/mysql-test/innodb_bug39438.test b/storage/innobase/mysql-test/innodb_bug39438.test
new file mode 100644
index 00000000000..52302871beb
--- /dev/null
+++ b/storage/innobase/mysql-test/innodb_bug39438.test
@@ -0,0 +1,51 @@
+#
+# Bug#39438 Testcase for Bug#39436 crashes on 5.1 in fil_space_get_latch
+# http://bugs.mysql.com/39438
+#
+# This test must be run with innodb_file_per_table=1 because the crash
+# only occurs if that option is turned on and DISCARD TABLESPACE only
+# works with innodb_file_per_table.
+#
+
+-- source include/have_innodb.inc
+
+SET storage_engine=InnoDB;
+
+# we care only that the following SQL commands do not crash the server
+-- disable_query_log
+-- disable_result_log
+
+DROP TABLE IF EXISTS bug39438;
+
+CREATE TABLE bug39438 (id INT) ENGINE=INNODB;
+
+# remove: XXX Uncomment the following ALTER and remove those lines after
+# remove: applying the patch.
+# remove: Obviously this test is useless without this ALTER command,
+# remove: but it causes warnings to be printed by mysqld and the whole
+# remove: mysql-test suite fails at the end (returns non-zero). Please
+# remove: apply this patch to the mysql source tree, remove those lines
+# remove: and uncomment the following ALTER. We do not care about the
+# remove: warnings, this test is to ensure mysqld does not crash.
+# remove: === modified file 'mysql-test/lib/mtr_report.pl'
+# remove: --- mysql-test/lib/mtr_report.pl 2008-08-12 10:26:23 +0000
+# remove: +++ mysql-test/lib/mtr_report.pl 2008-10-01 11:57:41 +0000
+# remove: @@ -412,7 +412,10 @@
+# remove:
+# remove: # When trying to set lower_case_table_names = 2
+# remove: # on a case sensitive file system. Bug#37402.
+# remove: - /lower_case_table_names was set to 2, even though your the file system '.*' is case sensitive. Now setting lower_case_table_names to 0 to avoid future problems./
+# remove: + /lower_case_table_names was set to 2, even though your the file system '.*' is case sensitive. Now setting lower_case_table_names to 0 to avoid future problems./ or
+# remove: +
+# remove: + # this test is expected to print warnings
+# remove: + ($testname eq 'main.innodb_bug39438')
+# remove: )
+# remove: {
+# remove: next; # Skip these lines
+# remove:
+#ALTER TABLE bug39438 DISCARD TABLESPACE;
+
+# this crashes the server if the bug is present
+SHOW TABLE STATUS;
+
+DROP TABLE bug39438;
diff --git a/storage/innobase/mysql-test/innodb_bug42101-nonzero-master.opt b/storage/innobase/mysql-test/innodb_bug42101-nonzero-master.opt
index d71dbe17d5b..455d66a06b8 100644
--- a/storage/innobase/mysql-test/innodb_bug42101-nonzero-master.opt
+++ b/storage/innobase/mysql-test/innodb_bug42101-nonzero-master.opt
@@ -1 +1 @@
---innodb_commit_concurrency=1
+--loose_innodb_commit_concurrency=1
diff --git a/storage/innobase/mysql-test/innodb_bug44369.result b/storage/innobase/mysql-test/innodb_bug44369.result
index e4b84ecac19..ff25c774aa2 100644
--- a/storage/innobase/mysql-test/innodb_bug44369.result
+++ b/storage/innobase/mysql-test/innodb_bug44369.result
@@ -1,14 +1,6 @@
create table bug44369 (DB_ROW_ID int) engine=innodb;
-ERROR HY000: Can't create table 'test.bug44369' (errno: -1)
+ERROR 42000: Incorrect column name 'DB_ROW_ID'
create table bug44369 (db_row_id int) engine=innodb;
-ERROR HY000: Can't create table 'test.bug44369' (errno: -1)
-show errors;
-Level Code Message
-Error 1005 Error creating table 'test/bug44369' with column name 'db_row_id'. 'db_row_id' is a reserved name. Please try to re-create the table with a different column name.
-Error 1005 Can't create table 'test.bug44369' (errno: -1)
+ERROR 42000: Incorrect column name 'db_row_id'
create table bug44369 (db_TRX_Id int) engine=innodb;
-ERROR HY000: Can't create table 'test.bug44369' (errno: -1)
-show errors;
-Level Code Message
-Error 1005 Error creating table 'test/bug44369' with column name 'db_TRX_Id'. 'db_TRX_Id' is a reserved name. Please try to re-create the table with a different column name.
-Error 1005 Can't create table 'test.bug44369' (errno: -1)
+ERROR 42000: Incorrect column name 'db_TRX_Id'
diff --git a/storage/innobase/mysql-test/innodb_bug44369.test b/storage/innobase/mysql-test/innodb_bug44369.test
index 495059eb5e6..f5d85cd5815 100644
--- a/storage/innobase/mysql-test/innodb_bug44369.test
+++ b/storage/innobase/mysql-test/innodb_bug44369.test
@@ -6,16 +6,12 @@
--source include/have_innodb.inc
# This create table operation should fail.
---error ER_CANT_CREATE_TABLE
+--error ER_WRONG_COLUMN_NAME
create table bug44369 (DB_ROW_ID int) engine=innodb;
# This create should fail as well
---error ER_CANT_CREATE_TABLE
+--error ER_WRONG_COLUMN_NAME
create table bug44369 (db_row_id int) engine=innodb;
-show errors;
-
---error ER_CANT_CREATE_TABLE
+--error ER_WRONG_COLUMN_NAME
create table bug44369 (db_TRX_Id int) engine=innodb;
-
-show errors;
diff --git a/storage/innobase/mysql-test/innodb_bug44571.result b/storage/innobase/mysql-test/innodb_bug44571.result
index 36374edcb3e..7ee7820a02d 100644
--- a/storage/innobase/mysql-test/innodb_bug44571.result
+++ b/storage/innobase/mysql-test/innodb_bug44571.result
@@ -2,8 +2,7 @@ CREATE TABLE bug44571 (foo INT) ENGINE=InnoDB;
ALTER TABLE bug44571 CHANGE foo bar INT;
ALTER TABLE bug44571 ADD INDEX bug44571b (foo);
ERROR 42000: Key column 'foo' doesn't exist in table
-ALTER TABLE bug44571 ADD INDEX bug44571b (bar);
-ERROR HY000: Incorrect key file for table 'bug44571'; try to repair it
-CREATE INDEX bug44571b ON bug44571 (bar);
-ERROR HY000: Incorrect key file for table 'bug44571'; try to repair it
+ALTER TABLE bug44571 ADD INDEX bug44571c (bar);
+DROP INDEX bug44571c ON bug44571;
+CREATE INDEX bug44571c ON bug44571 (bar);
DROP TABLE bug44571;
diff --git a/storage/innobase/mysql-test/innodb_bug44571.test b/storage/innobase/mysql-test/innodb_bug44571.test
index 685463ceff9..91b6722d8af 100644
--- a/storage/innobase/mysql-test/innodb_bug44571.test
+++ b/storage/innobase/mysql-test/innodb_bug44571.test
@@ -1,17 +1,22 @@
#
# Bug#44571 InnoDB Plugin crashes on ADD INDEX
# http://bugs.mysql.com/44571
+# Please also refer to related fix in
+# http://bugs.mysql.com/47621
#
-- source include/have_innodb.inc
CREATE TABLE bug44571 (foo INT) ENGINE=InnoDB;
ALTER TABLE bug44571 CHANGE foo bar INT;
+# Create index with the old column name will fail,
+# because the CHANGE foo bar is successful. And
+# the column name change would communicate to
+# InnoDB with the fix from bug #47621
-- error ER_KEY_COLUMN_DOES_NOT_EXITS
ALTER TABLE bug44571 ADD INDEX bug44571b (foo);
-# The following will fail, because the CHANGE foo bar was
-# not communicated to InnoDB.
---error ER_NOT_KEYFILE
-ALTER TABLE bug44571 ADD INDEX bug44571b (bar);
---error ER_NOT_KEYFILE
-CREATE INDEX bug44571b ON bug44571 (bar);
+# The following create indexes should succeed,
+# indirectly confirm the CHANGE foo bar is successful.
+ALTER TABLE bug44571 ADD INDEX bug44571c (bar);
+DROP INDEX bug44571c ON bug44571;
+CREATE INDEX bug44571c ON bug44571 (bar);
DROP TABLE bug44571;
diff --git a/storage/innobase/mysql-test/innodb_bug46000.result b/storage/innobase/mysql-test/innodb_bug46000.result
index ccff888a48d..c8e3db8d641 100644
--- a/storage/innobase/mysql-test/innodb_bug46000.result
+++ b/storage/innobase/mysql-test/innodb_bug46000.result
@@ -1,17 +1,19 @@
create table bug46000(`id` int,key `GEN_CLUST_INDEX`(`id`))engine=innodb;
-ERROR HY000: Can't create table 'test.bug46000' (errno: -1)
+ERROR 42000: Incorrect index name 'GEN_CLUST_INDEX'
create table bug46000(`id` int, key `GEN_clust_INDEX`(`id`))engine=innodb;
-ERROR HY000: Can't create table 'test.bug46000' (errno: -1)
-show errors;
+ERROR 42000: Incorrect index name 'GEN_CLUST_INDEX'
+show warnings;
Level Code Message
-Error 1005 Cannot Create Index with name 'GEN_CLUST_INDEX'. The name is reserved for the system default primary index.
+Warning 1280 Cannot Create Index with name 'GEN_CLUST_INDEX'. The name is reserved for the system default primary index.
+Error 1280 Incorrect index name 'GEN_CLUST_INDEX'
Error 1005 Can't create table 'test.bug46000' (errno: -1)
create table bug46000(id int) engine=innodb;
create index GEN_CLUST_INDEX on bug46000(id);
-ERROR HY000: Can't create table '#sql-temporary' (errno: -1)
-show errors;
+ERROR 42000: Incorrect index name 'GEN_CLUST_INDEX'
+show warnings;
Level Code Message
-Error 1005 Cannot Create Index with name 'GEN_CLUST_INDEX'. The name is reserved for the system default primary index.
-Error 1005 Can't create table '#sql-temporary' (errno: -1)
+Warning 1280 Cannot Create Index with name 'GEN_CLUST_INDEX'. The name is reserved for the system default primary index.
+Error 1280 Incorrect index name 'GEN_CLUST_INDEX'
+Error 1030 Got error -1 from storage engine
create index idx on bug46000(id);
drop table bug46000;
diff --git a/storage/innobase/mysql-test/innodb_bug46000.test b/storage/innobase/mysql-test/innodb_bug46000.test
index 80c18c58ef0..5a3c666326e 100644
--- a/storage/innobase/mysql-test/innodb_bug46000.test
+++ b/storage/innobase/mysql-test/innodb_bug46000.test
@@ -7,24 +7,22 @@
# This 'create table' operation should fail because of
# using the reserve name as its index name.
---error ER_CANT_CREATE_TABLE
+--error ER_WRONG_NAME_FOR_INDEX
create table bug46000(`id` int,key `GEN_CLUST_INDEX`(`id`))engine=innodb;
# Mixed upper/lower case of the reserved key words
---error ER_CANT_CREATE_TABLE
+--error ER_WRONG_NAME_FOR_INDEX
create table bug46000(`id` int, key `GEN_clust_INDEX`(`id`))engine=innodb;
-show errors;
+show warnings;
create table bug46000(id int) engine=innodb;
# This 'create index' operation should fail.
---replace_regex /'[^']*test.#sql-[0-9a-f_]*'/'#sql-temporary'/
---error ER_CANT_CREATE_TABLE
+--error ER_WRONG_NAME_FOR_INDEX
create index GEN_CLUST_INDEX on bug46000(id);
---replace_regex /'[^']*test.#sql-[0-9a-f_]*'/'#sql-temporary'/
-show errors;
+show warnings;
# This 'create index' operation should succeed, no
# temp table left from last failed create index
diff --git a/storage/innobase/mysql-test/innodb_bug47621.result b/storage/innobase/mysql-test/innodb_bug47621.result
new file mode 100644
index 00000000000..c5f56c09788
--- /dev/null
+++ b/storage/innobase/mysql-test/innodb_bug47621.result
@@ -0,0 +1,21 @@
+CREATE TABLE bug47621 (salesperson INT) ENGINE=InnoDB;
+ALTER TABLE bug47621 CHANGE salesperson sales_acct_id INT;
+create index orgs on bug47621(sales_acct_id);
+ALTER TABLE bug47621 CHANGE sales_acct_id salesperson INT;
+drop table bug47621;
+CREATE TABLE bug47621_sale (
+salesperson INT,
+PRIMARY KEY(salesperson)) engine = innodb;
+CREATE TABLE bug47621_shirt(
+id SMALLINT,
+owner INT,
+FOREIGN KEY(owner)
+references bug47621_sale(salesperson) ON DELETE RESTRICT)
+engine = innodb;
+insert into bug47621_sale values(9);
+insert into bug47621_shirt values(1, 9);
+ALTER TABLE bug47621_shirt CHANGE id new_id INT;
+drop table bug47621_shirt;
+ALTER TABLE bug47621_sale CHANGE salesperson sales_acct_id INT;
+ALTER TABLE bug47621_sale ADD INDEX idx (sales_acct_id);
+drop table bug47621_sale;
diff --git a/storage/innobase/mysql-test/innodb_bug47621.test b/storage/innobase/mysql-test/innodb_bug47621.test
new file mode 100644
index 00000000000..4863cc6bba1
--- /dev/null
+++ b/storage/innobase/mysql-test/innodb_bug47621.test
@@ -0,0 +1,57 @@
+# This is the test for bug #47621, column rename operation should
+# not result in column definition inconsistency between MySQL and
+# InnoDB
+
+--source include/have_innodb.inc
+
+CREATE TABLE bug47621 (salesperson INT) ENGINE=InnoDB;
+
+# Change the column name
+ALTER TABLE bug47621 CHANGE salesperson sales_acct_id INT;
+
+# If there is inconsistency of column name definition
+# in MySQL or InnoDB, following create index would fail
+create index orgs on bug47621(sales_acct_id);
+
+# Change the column name back with the index defined on it.
+ALTER TABLE bug47621 CHANGE sales_acct_id salesperson INT;
+
+drop table bug47621;
+
+CREATE TABLE bug47621_sale (
+ salesperson INT,
+ PRIMARY KEY(salesperson)) engine = innodb;
+
+CREATE TABLE bug47621_shirt(
+ id SMALLINT,
+ owner INT,
+ FOREIGN KEY(owner)
+ references bug47621_sale(salesperson) ON DELETE RESTRICT)
+ engine = innodb;
+
+insert into bug47621_sale values(9);
+
+insert into bug47621_shirt values(1, 9);
+
+# Any rename operation on columns involved in a reference constraint will
+# fail, as it will be rejected by InnoDB row_rename_table_for_mysql().
+# In above example, any rename on column "salesperson" for table
+# "bug47621_sale", or on column "owner" for table "bug47621_shirt will
+# be blocked. We do not put such rename in the test since InnoDB error
+# message will be printed in the error log, and result in test failure.
+#
+# ALTER TABLE bug47621_sale CHANGE salesperson sales_acct_id INT;
+
+# Any rename on columns not involved in the foreign key constraint
+# could still proceed
+ALTER TABLE bug47621_shirt CHANGE id new_id INT;
+
+# Referencing table dropped, the rename operation on related columns
+# could proceed
+drop table bug47621_shirt;
+
+ALTER TABLE bug47621_sale CHANGE salesperson sales_acct_id INT;
+
+ALTER TABLE bug47621_sale ADD INDEX idx (sales_acct_id);
+
+drop table bug47621_sale;
diff --git a/storage/innobase/mysql-test/innodb_bug47622.result b/storage/innobase/mysql-test/innodb_bug47622.result
new file mode 100644
index 00000000000..f5d13711c52
--- /dev/null
+++ b/storage/innobase/mysql-test/innodb_bug47622.result
@@ -0,0 +1,23 @@
+CREATE TABLE bug47622(
+`rule_key` int(11) NOT NULL DEFAULT '0',
+`seq` smallint(6) NOT NULL DEFAULT '0',
+`action` smallint(6) NOT NULL DEFAULT '0',
+`arg_id` smallint(6) DEFAULT NULL,
+`else_ind` TINYINT NOT NULL,
+KEY IDX_A (`arg_id`)
+) ENGINE=InnoDB;
+ALTER TABLE bug47622 ADD UNIQUE IDX_B (rule_key,else_ind,seq,action,arg_id);
+drop index IDX_B on bug47622;
+create index idx on bug47622(seq, arg_id);
+ALTER TABLE bug47622 ADD UNIQUE IDX_X (rule_key,else_ind,seq,action);
+drop table bug47622;
+CREATE TABLE bug47622 (
+`a` int(11) NOT NULL,
+`b` int(11) DEFAULT NULL,
+`c` char(10) DEFAULT NULL,
+`d` varchar(20) DEFAULT NULL,
+PRIMARY KEY (`a`),
+KEY `b` (`b`)
+) ENGINE=InnoDB;
+alter table bug47622 add unique index (c), add index (d);
+drop table bug47622;
diff --git a/storage/innobase/mysql-test/innodb_bug47622.test b/storage/innobase/mysql-test/innodb_bug47622.test
new file mode 100644
index 00000000000..9cf9d0e531b
--- /dev/null
+++ b/storage/innobase/mysql-test/innodb_bug47622.test
@@ -0,0 +1,55 @@
+# This is the test for bug 47622. There could be index
+# metadata sequence mismatch between MySQL and Innodb
+# after creating index through FIC interfaces.
+# We resolve the problem by sync the index sequence
+# up when opening the table.
+
+--source include/have_innodb.inc
+
+connect (a,localhost,root,,);
+connect (b,localhost,root,,);
+
+# Create a table with a non-unique index
+CREATE TABLE bug47622(
+ `rule_key` int(11) NOT NULL DEFAULT '0',
+ `seq` smallint(6) NOT NULL DEFAULT '0',
+ `action` smallint(6) NOT NULL DEFAULT '0',
+ `arg_id` smallint(6) DEFAULT NULL,
+ `else_ind` TINYINT NOT NULL,
+ KEY IDX_A (`arg_id`)
+) ENGINE=InnoDB;
+
+connection a;
+
+# A subsequent creating unique index should not trigger
+# any error message. Unique index would be ranked ahead
+# of regular index.
+ALTER TABLE bug47622 ADD UNIQUE IDX_B (rule_key,else_ind,seq,action,arg_id);
+
+drop index IDX_B on bug47622;
+
+# In another connection, create additional set of normal
+# index and unique index. Again, unique index would be ranked
+# ahead of regular index.
+connection b;
+create index idx on bug47622(seq, arg_id);
+
+ALTER TABLE bug47622 ADD UNIQUE IDX_X (rule_key,else_ind,seq,action);
+
+drop table bug47622;
+
+# Create a table with one Primary key and a non-unique key
+CREATE TABLE bug47622 (
+ `a` int(11) NOT NULL,
+ `b` int(11) DEFAULT NULL,
+ `c` char(10) DEFAULT NULL,
+ `d` varchar(20) DEFAULT NULL,
+ PRIMARY KEY (`a`),
+ KEY `b` (`b`)
+) ENGINE=InnoDB;
+
+# Add two index with one unique and one non-unique.
+# Index sequence is "PRIMARY", "c", "b" and "d"
+alter table bug47622 add unique index (c), add index (d);
+
+drop table bug47622;
diff --git a/storage/innobase/mysql-test/innodb_bug47777.result b/storage/innobase/mysql-test/innodb_bug47777.result
new file mode 100644
index 00000000000..fbba47edcfc
--- /dev/null
+++ b/storage/innobase/mysql-test/innodb_bug47777.result
@@ -0,0 +1,13 @@
+create table bug47777(c2 linestring not null, primary key (c2(1))) engine=innodb;
+insert into bug47777 values (geomfromtext('linestring(1 2,3 4,5 6,7 8,9 10)'));
+select count(*) from bug47777 where c2 =geomfromtext('linestring(1 2,3 4,5 6,7 8,9 10)');
+count(*)
+1
+update bug47777 set c2=GeomFromText('POINT(1 1)');
+select count(*) from bug47777 where c2 =geomfromtext('linestring(1 2,3 4,5 6,7 8,9 10)');
+count(*)
+0
+select count(*) from bug47777 where c2 = GeomFromText('POINT(1 1)');
+count(*)
+1
+drop table bug47777;
diff --git a/storage/innobase/mysql-test/innodb_bug47777.test b/storage/innobase/mysql-test/innodb_bug47777.test
new file mode 100644
index 00000000000..8f2985b2cf0
--- /dev/null
+++ b/storage/innobase/mysql-test/innodb_bug47777.test
@@ -0,0 +1,24 @@
+# This is the test for bug 47777. GEOMETRY
+# data is treated as BLOB data in innodb.
+# Consequently, its key value generation/storing
+# should follow the process for the BLOB
+# datatype as well.
+
+--source include/have_innodb.inc
+
+create table bug47777(c2 linestring not null, primary key (c2(1))) engine=innodb;
+
+insert into bug47777 values (geomfromtext('linestring(1 2,3 4,5 6,7 8,9 10)'));
+
+# Verify correct row get inserted.
+select count(*) from bug47777 where c2 =geomfromtext('linestring(1 2,3 4,5 6,7 8,9 10)');
+
+# Update table bug47777 should be successful.
+update bug47777 set c2=GeomFromText('POINT(1 1)');
+
+# Verify the row get updated successfully. The original
+# c2 value should be changed to GeomFromText('POINT(1 1)').
+select count(*) from bug47777 where c2 =geomfromtext('linestring(1 2,3 4,5 6,7 8,9 10)');
+select count(*) from bug47777 where c2 = GeomFromText('POINT(1 1)');
+
+drop table bug47777;
diff --git a/storage/innobase/mysql-test/innodb_bug51378.result b/storage/innobase/mysql-test/innodb_bug51378.result
new file mode 100644
index 00000000000..a3ca73c16a9
--- /dev/null
+++ b/storage/innobase/mysql-test/innodb_bug51378.result
@@ -0,0 +1,66 @@
+create table bug51378 (
+col1 int not null,
+col2 blob not null,
+col3 time not null) engine = innodb;
+create unique index idx on bug51378(col1, col2(31));
+alter table bug51378 add unique index idx2(col1, col2(31));
+create unique index idx3 on bug51378(col1, col3);
+SHOW CREATE TABLE bug51378;
+Table Create Table
+bug51378 CREATE TABLE `bug51378` (
+ `col1` int(11) NOT NULL,
+ `col2` blob NOT NULL,
+ `col3` time NOT NULL,
+ UNIQUE KEY `idx3` (`col1`,`col3`),
+ UNIQUE KEY `idx` (`col1`,`col2`(31)),
+ UNIQUE KEY `idx2` (`col1`,`col2`(31))
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+drop index idx3 on bug51378;
+SHOW CREATE TABLE bug51378;
+Table Create Table
+bug51378 CREATE TABLE `bug51378` (
+ `col1` int(11) NOT NULL,
+ `col2` blob NOT NULL,
+ `col3` time NOT NULL,
+ UNIQUE KEY `idx` (`col1`,`col2`(31)),
+ UNIQUE KEY `idx2` (`col1`,`col2`(31))
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+alter table bug51378 add primary key idx3(col1, col2(31));
+SHOW CREATE TABLE bug51378;
+Table Create Table
+bug51378 CREATE TABLE `bug51378` (
+ `col1` int(11) NOT NULL,
+ `col2` blob NOT NULL,
+ `col3` time NOT NULL,
+ PRIMARY KEY (`col1`,`col2`(31)),
+ UNIQUE KEY `idx` (`col1`,`col2`(31)),
+ UNIQUE KEY `idx2` (`col1`,`col2`(31))
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+drop table bug51378;
+create table bug51378 (
+col1 int not null,
+col2 blob not null,
+col3 time not null, primary key(col1, col2(31))) engine = innodb;
+create unique index idx on bug51378(col1, col2(31));
+SHOW CREATE TABLE bug51378;
+Table Create Table
+bug51378 CREATE TABLE `bug51378` (
+ `col1` int(11) NOT NULL,
+ `col2` blob NOT NULL,
+ `col3` time NOT NULL,
+ PRIMARY KEY (`col1`,`col2`(31)),
+ UNIQUE KEY `idx` (`col1`,`col2`(31))
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+drop table bug51378;
+create table bug51378 (
+col1 int not null,
+col2 int ) engine = innodb;
+create unique index idx on bug51378(col1, col2);
+SHOW CREATE TABLE bug51378;
+Table Create Table
+bug51378 CREATE TABLE `bug51378` (
+ `col1` int(11) NOT NULL,
+ `col2` int(11) DEFAULT NULL,
+ UNIQUE KEY `idx` (`col1`,`col2`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+drop table bug51378;
diff --git a/storage/innobase/mysql-test/innodb_bug51378.test b/storage/innobase/mysql-test/innodb_bug51378.test
new file mode 100644
index 00000000000..8f7b0b9605a
--- /dev/null
+++ b/storage/innobase/mysql-test/innodb_bug51378.test
@@ -0,0 +1,77 @@
+# This is the test for bug 51378. Unique index created
+# through "create index" and "alter table add unique index"
+# interfaces should not be treated as primary index if indexed
+# columns contain one or more column prefix(es) (only prefix/part of
+# the column is indexed)
+# On the other hand, if there is a unique index covers all
+# columns of a table, and they are non-null columns, and
+# full length of the column are indexed, then this index
+# will be created as primary index
+# Following queries test various scenario, no mismatch
+# error message should be printed.
+--source include/have_innodb.inc
+
+# Create a table contains a BLOB column
+create table bug51378 (
+ col1 int not null,
+ col2 blob not null,
+ col3 time not null) engine = innodb;
+
+# Create following unique indexes on 'col1' and 'col2(31)'
+# of the table, the index should not be treated as primary
+# key because it indexes only first 31 bytes of col2.
+# Thus it contains "column prefix", and will not be
+# upgraded to primary index.
+# There should not be mismatch message printed in the
+# errorlog
+create unique index idx on bug51378(col1, col2(31));
+
+alter table bug51378 add unique index idx2(col1, col2(31));
+
+# Unique index on 'col1' and 'col3' will be created as primary index,
+# since the index does not contain column prefix
+create unique index idx3 on bug51378(col1, col3);
+
+# Show create table would show idx3 created as unique index, internally,
+# idx3 is treated as primary index both by MySQL and Innodb
+SHOW CREATE TABLE bug51378;
+
+# "GEN_CLUST_INDEX" will be re-created as default primary index
+# after idx3 is dropped
+drop index idx3 on bug51378;
+
+SHOW CREATE TABLE bug51378;
+
+# Or we can add the primary key through alter table interfaces
+alter table bug51378 add primary key idx3(col1, col2(31));
+
+SHOW CREATE TABLE bug51378;
+
+drop table bug51378;
+
+# Or we can create such primary key through create table interfaces
+create table bug51378 (
+ col1 int not null,
+ col2 blob not null,
+ col3 time not null, primary key(col1, col2(31))) engine = innodb;
+
+# Unique index on one or more column prefix(es) will be created
+# as non-cluster index
+create unique index idx on bug51378(col1, col2(31));
+
+SHOW CREATE TABLE bug51378;
+
+drop table bug51378;
+
+# If a table has a NULLABLE column, unique index on it will not
+# be treated as primary index.
+create table bug51378 (
+ col1 int not null,
+ col2 int ) engine = innodb;
+
+# This will be created as non-cluster index since col2 is nullable
+create unique index idx on bug51378(col1, col2);
+
+SHOW CREATE TABLE bug51378;
+
+drop table bug51378;
diff --git a/storage/innobase/mysql-test/innodb_bug51920.result b/storage/innobase/mysql-test/innodb_bug51920.result
new file mode 100644
index 00000000000..4c2ec3e01e5
--- /dev/null
+++ b/storage/innobase/mysql-test/innodb_bug51920.result
@@ -0,0 +1,13 @@
+CREATE TABLE bug51920 (i INT) ENGINE=InnoDB;
+INSERT INTO bug51920 VALUES (1);
+BEGIN;
+SELECT * FROM bug51920 FOR UPDATE;
+i
+1
+UPDATE bug51920 SET i=2;
+SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST
+WHERE INFO="UPDATE bug51920 SET i=2"
+INTO @thread_id;
+KILL @thread_id;
+ERROR 70100: Query execution was interrupted
+DROP TABLE bug51920;
diff --git a/storage/innobase/mysql-test/innodb_bug51920.test b/storage/innobase/mysql-test/innodb_bug51920.test
new file mode 100644
index 00000000000..05c884134be
--- /dev/null
+++ b/storage/innobase/mysql-test/innodb_bug51920.test
@@ -0,0 +1,39 @@
+#
+# Bug #51920: InnoDB connections in lock wait ignore KILL until timeout
+#
+-- source include/not_embedded.inc
+-- source include/have_innodb.inc
+
+CREATE TABLE bug51920 (i INT) ENGINE=InnoDB;
+INSERT INTO bug51920 VALUES (1);
+
+BEGIN;
+SELECT * FROM bug51920 FOR UPDATE;
+
+connect (con1,localhost,root,,);
+
+connection con1;
+--send
+UPDATE bug51920 SET i=2;
+
+connection default;
+let $wait_condition =
+ SELECT COUNT(*)=1 FROM information_schema.processlist
+ WHERE INFO="UPDATE bug51920 SET i=2";
+-- source include/wait_condition.inc
+
+SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST
+WHERE INFO="UPDATE bug51920 SET i=2"
+INTO @thread_id;
+
+KILL @thread_id;
+let $wait_condition =
+ SELECT COUNT(*)=0 FROM information_schema.processlist WHERE ID=@thread_id;
+-- source include/wait_condition.inc
+
+connection con1;
+-- error ER_QUERY_INTERRUPTED
+reap;
+connection default;
+DROP TABLE bug51920;
+-- disconnect con1
diff --git a/storage/innobase/mysql-test/innodb_file_format.result b/storage/innobase/mysql-test/innodb_file_format.result
index 8e9a317308b..86d60706084 100644
--- a/storage/innobase/mysql-test/innodb_file_format.result
+++ b/storage/innobase/mysql-test/innodb_file_format.result
@@ -30,8 +30,6 @@ select @@innodb_file_format_check;
@@innodb_file_format_check
Barracuda
set global innodb_file_format_check=default;
-Warnings:
-Warning 1210 Ignoring SET innodb_file_format=on
select @@innodb_file_format_check;
@@innodb_file_format_check
Barracuda
diff --git a/storage/innobase/mysql-test/innodb_information_schema.test b/storage/innobase/mysql-test/innodb_information_schema.test
index eaed653854a..fc1d38d8d14 100644
--- a/storage/innobase/mysql-test/innodb_information_schema.test
+++ b/storage/innobase/mysql-test/innodb_information_schema.test
@@ -109,14 +109,18 @@ SELECT * FROM ```t'\"_str` WHERE c1 = '3' FOR UPDATE;
-- send
SELECT * FROM ```t'\"_str` WHERE c1 = '4' FOR UPDATE;
-# Give time to the above 2 queries to execute before continuing.
-# Without this sleep it sometimes happens that the SELECT from innodb_locks
-# executes before some of them, resulting in less than expected number
-# of rows being selected from innodb_locks.
--- sleep 0.1
-
-- enable_result_log
-- connection con_verify_innodb_locks
+# Wait for the above queries to execute before continuing.
+# Without this, it sometimes happens that the SELECT from innodb_locks
+# executes before some of them, resulting in less than expected number
+# of rows being selected from innodb_locks. If there is a bug and there
+# are no 14 rows in innodb_locks then this test will fail with timeout.
+let $count = 14;
+let $table = INFORMATION_SCHEMA.INNODB_LOCKS;
+-- source include/wait_until_rows_count.inc
+# the above enables the query log, re-disable it
+-- disable_query_log
SELECT lock_mode, lock_type, lock_table, lock_index, lock_rec, lock_data
FROM INFORMATION_SCHEMA.INNODB_LOCKS ORDER BY lock_data;
diff --git a/storage/innobase/mysql-test/patches/innodb-index.diff b/storage/innobase/mysql-test/patches/innodb-index.diff
deleted file mode 100644
index 0b008c96f25..00000000000
--- a/storage/innobase/mysql-test/patches/innodb-index.diff
+++ /dev/null
@@ -1,62 +0,0 @@
-This part of the innodb-index test causes mysqld to print some warnings
-and subsequently the whole mysql-test suite to fail.
-
-A permanent solution is probably to remove the printouts from the source
-code or to somehow tell the mysql-test suite that warnings are expected.
-Currently we simply do not execute the problematic tests. Please
-coordinate a permanent solution with Marko, who added those tests.
-
-This cannot be proposed to MySQL because it touches files that are not
-in the MySQL source repository.
-
-Index: storage/innobase/mysql-test/innodb-index.result
-===================================================================
---- storage/innobase/mysql-test/innodb-index.result (revision 2870)
-+++ storage/innobase/mysql-test/innodb-index.result (working copy)
-@@ -43,19 +43,12 @@ t1 CREATE TABLE `t1` (
- `b` int(11) DEFAULT NULL,
- `c` char(10) NOT NULL,
- `d` varchar(20) DEFAULT NULL,
- KEY `d2` (`d`),
- KEY `b` (`b`)
- ) ENGINE=InnoDB DEFAULT CHARSET=latin1
--CREATE TABLE `t1#1`(a INT PRIMARY KEY) ENGINE=InnoDB;
--alter table t1 add unique index (c), add index (d);
--ERROR HY000: Table 'test.t1#1' already exists
--rename table `t1#1` to `t1#2`;
--alter table t1 add unique index (c), add index (d);
--ERROR HY000: Table 'test.t1#2' already exists
--drop table `t1#2`;
- alter table t1 add unique index (c), add index (d);
- show create table t1;
- Table Create Table
- t1 CREATE TABLE `t1` (
- `a` int(11) NOT NULL,
- `b` int(11) DEFAULT NULL,
-Index: storage/innobase/mysql-test/innodb-index.test
-===================================================================
---- storage/innobase/mysql-test/innodb-index.test (revision 2870)
-+++ storage/innobase/mysql-test/innodb-index.test (working copy)
-@@ -14,22 +14,12 @@ select * from t1 force index (d2) order
- --error ER_DUP_ENTRY
- alter table t1 add unique index (b);
- show create table t1;
- alter table t1 add index (b);
- show create table t1;
-
--# Check how existing tables interfere with temporary tables.
--CREATE TABLE `t1#1`(a INT PRIMARY KEY) ENGINE=InnoDB;
--
----error 156
--alter table t1 add unique index (c), add index (d);
--rename table `t1#1` to `t1#2`;
----error 156
--alter table t1 add unique index (c), add index (d);
--drop table `t1#2`;
--
- alter table t1 add unique index (c), add index (d);
- show create table t1;
- explain select * from t1 force index(c) order by c;
- alter table t1 add primary key (a), drop index c;
- show create table t1;
- --error ER_MULTIPLE_PRI_KEY
diff --git a/storage/innobase/mysql-test/patches/innodb_change_buffering_basic.diff b/storage/innobase/mysql-test/patches/innodb_change_buffering_basic.diff
new file mode 100644
index 00000000000..bfa1609a97c
--- /dev/null
+++ b/storage/innobase/mysql-test/patches/innodb_change_buffering_basic.diff
@@ -0,0 +1,60 @@
+--- mysql-test/suite/sys_vars/t/innodb_change_buffering_basic.test.orig Mon Mar 15 16:15:22 2010
++++ mysql-test/suite/sys_vars/t/innodb_change_buffering_basic.test Fri Mar 19 01:19:09 2010
+@@ -11,8 +11,8 @@
+ #
+ # exists as global only
+ #
+---echo Valid values are 'inserts' and 'none'
+-select @@global.innodb_change_buffering in ('inserts', 'none');
++--echo Valid values are 'inserts', 'deletes', 'changes', 'purges', 'all', and 'none'
++select @@global.innodb_change_buffering in ('inserts', 'deletes', 'changes', 'purges', 'all', 'none');
+ select @@global.innodb_change_buffering;
+ --error ER_INCORRECT_GLOBAL_LOCAL_VAR
+ select @@session.innodb_change_buffering;
+
+--- mysql-test/suite/sys_vars/r/innodb_change_buffering_basic.result.orig Mon Mar 15 16:15:22 2010
++++ mysql-test/suite/sys_vars/r/innodb_change_buffering_basic.result Fri Mar 19 01:23:58 2010
+@@ -1,28 +1,28 @@
+ SET @start_global_value = @@global.innodb_change_buffering;
+ SELECT @start_global_value;
+ @start_global_value
+-inserts
+-Valid values are 'inserts' and 'none'
+-select @@global.innodb_change_buffering in ('inserts', 'none');
+-@@global.innodb_change_buffering in ('inserts', 'none')
++all
++Valid values are 'inserts', 'deletes', 'changes', 'purges', 'all', and 'none'
++select @@global.innodb_change_buffering in ('inserts', 'deletes', 'changes', 'purges', 'all', 'none');
++@@global.innodb_change_buffering in ('inserts', 'deletes', 'changes', 'purges', 'all', 'none')
+ 1
+ select @@global.innodb_change_buffering;
+ @@global.innodb_change_buffering
+-inserts
++all
+ select @@session.innodb_change_buffering;
+ ERROR HY000: Variable 'innodb_change_buffering' is a GLOBAL variable
+ show global variables like 'innodb_change_buffering';
+ Variable_name Value
+-innodb_change_buffering inserts
++innodb_change_buffering all
+ show session variables like 'innodb_change_buffering';
+ Variable_name Value
+-innodb_change_buffering inserts
++innodb_change_buffering all
+ select * from information_schema.global_variables where variable_name='innodb_change_buffering';
+ VARIABLE_NAME VARIABLE_VALUE
+-INNODB_CHANGE_BUFFERING inserts
++INNODB_CHANGE_BUFFERING all
+ select * from information_schema.session_variables where variable_name='innodb_change_buffering';
+ VARIABLE_NAME VARIABLE_VALUE
+-INNODB_CHANGE_BUFFERING inserts
++INNODB_CHANGE_BUFFERING all
+ set global innodb_change_buffering='none';
+ select @@global.innodb_change_buffering;
+ @@global.innodb_change_buffering
+@@ -60,4 +60,4 @@
+ SET @@global.innodb_change_buffering = @start_global_value;
+ SELECT @@global.innodb_change_buffering;
+ @@global.innodb_change_buffering
+-inserts
++all
diff --git a/storage/innobase/os/os0file.c b/storage/innobase/os/os0file.c
index 37edad442db..c0551f8bd63 100644
--- a/storage/innobase/os/os0file.c
+++ b/storage/innobase/os/os0file.c
@@ -1,23 +1,6 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
-
-*****************************************************************************/
/***********************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2009, Percona Inc.
Portions of this file contain modifications contributed and copyrighted
@@ -50,6 +33,11 @@ Created 10/21/1995 Heikki Tuuri
*******************************************************/
#include "os0file.h"
+
+#ifdef UNIV_NONINL
+#include "os0file.ic"
+#endif
+
#include "ut0mem.h"
#include "srv0srv.h"
#include "srv0start.h"
@@ -67,6 +55,10 @@ Created 10/21/1995 Heikki Tuuri
# endif /* __WIN__ */
#endif /* !UNIV_HOTBACKUP */
+#if defined(LINUX_NATIVE_AIO)
+#include <libaio.h>
+#endif
+
/* This specifies the file permissions InnoDB uses when it creates files in
Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
my_umask */
@@ -88,9 +80,7 @@ UNIV_INTERN ibool os_do_not_call_flush_at_each_write = FALSE;
/* We do not call os_file_flush in every os_file_write. */
#endif /* UNIV_DO_FLUSH */
-#ifdef UNIV_HOTBACKUP
-# define os_aio_use_native_aio FALSE
-#else /* UNIV_HOTBACKUP */
+#ifndef UNIV_HOTBACKUP
/* We use these mutexes to protect lseek + file i/o operation, if the
OS does not provide an atomic pread or pwrite, or similar */
#define OS_FILE_N_SEEK_MUTEXES 16
@@ -99,15 +89,70 @@ UNIV_INTERN os_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
/* In simulated aio, merge at most this many consecutive i/os */
#define OS_AIO_MERGE_N_CONSECUTIVE 64
-/** If this flag is TRUE, then we will use the native aio of the
-OS (provided we compiled Innobase with it in), otherwise we will
-use simulated aio we build below with threads */
-
-UNIV_INTERN ibool os_aio_use_native_aio = FALSE;
+/**********************************************************************
+
+InnoDB AIO Implementation:
+=========================
+
+We support native AIO for windows and linux. For rest of the platforms
+we simulate AIO by special io-threads servicing the IO-requests.
+
+Simulated AIO:
+==============
+
+In platforms where we 'simulate' AIO following is a rough explanation
+of the high level design.
+There are four io-threads (for ibuf, log, read, write).
+All synchronous IO requests are serviced by the calling thread using
+os_file_write/os_file_read. The Asynchronous requests are queued up
+in an array (there are four such arrays) by the calling thread.
+Later these requests are picked up by the io-thread and are serviced
+synchronously.
+
+Windows native AIO:
+==================
+
+If srv_use_native_aio is not set then windows follow the same
+code as simulated AIO. If the flag is set then native AIO interface
+is used. On windows, one of the limitation is that if a file is opened
+for AIO no synchronous IO can be done on it. Therefore we have an
+extra fifth array to queue up synchronous IO requests.
+There are innodb_file_io_threads helper threads. These threads work
+on the four arrays mentioned above in Simulated AIO. No thread is
+required for the sync array.
+If a synchronous IO request is made, it is first queued in the sync
+array. Then the calling thread itself waits on the request, thus
+making the call synchronous.
+If an AIO request is made the calling thread not only queues it in the
+array but also submits the requests. The helper thread then collects
+the completed IO request and calls completion routine on it.
+
+Linux native AIO:
+=================
+
+If we have libaio installed on the system and innodb_use_native_aio
+is set to TRUE we follow the code path of native AIO, otherwise we
+do simulated AIO.
+There are innodb_file_io_threads helper threads. These threads work
+on the four arrays mentioned above in Simulated AIO.
+If a synchronous IO request is made, it is handled by calling
+os_file_write/os_file_read.
+If an AIO request is made the calling thread not only queues it in the
+array but also submits the requests. The helper thread then collects
+the completed IO request and calls completion routine on it.
+
+**********************************************************************/
/** Flag: enable debug printout for asynchronous i/o */
UNIV_INTERN ibool os_aio_print_debug = FALSE;
+#ifdef UNIV_PFS_IO
+/* Keys to register InnoDB I/O with performance schema */
+UNIV_INTERN mysql_pfs_key_t innodb_file_data_key;
+UNIV_INTERN mysql_pfs_key_t innodb_file_log_key;
+UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key;
+#endif /* UNIV_PFS_IO */
+
/** The asynchronous i/o array slot structure */
typedef struct os_aio_slot_struct os_aio_slot_t;
@@ -142,6 +187,10 @@ struct os_aio_slot_struct{
OVERLAPPED struct */
OVERLAPPED control; /*!< Windows control block for the
aio request */
+#elif defined(LINUX_NATIVE_AIO)
+ struct iocb control; /* Linux control block for aio */
+ int n_bytes; /* bytes written/read. */
+ int ret; /* AIO return code */
#endif
};
@@ -167,6 +216,10 @@ struct os_aio_array_struct{
array of pending aio requests. A
thread can wait separately for any one
of the segments. */
+ ulint cur_seg;/*!< We reserve IO requests in round
+ robin fashion to different segments.
+ This points to the segment that is to
+ be used to service next IO request. */
ulint n_reserved;
/*!< Number of reserved slots in the
aio array outside the ibuf segment */
@@ -180,8 +233,31 @@ struct os_aio_array_struct{
WaitForMultipleObjects; used only in
Windows */
#endif
+
+#if defined(LINUX_NATIVE_AIO)
+ io_context_t* aio_ctx;
+ /* completion queue for IO. There is
+ one such queue per segment. Each thread
+ will work on one ctx exclusively. */
+ struct io_event* aio_events;
+ /* The array to collect completed IOs.
+ There is one such event for each
+ possible pending IO. The size of the
+ array is equal to n_slots. */
+#endif
};
+#if defined(LINUX_NATIVE_AIO)
+/** timeout for each io_getevents() call = 500ms. */
+#define OS_AIO_REAP_TIMEOUT (500000000UL)
+
+/** time to sleep, in microseconds if io_setup() returns EAGAIN. */
+#define OS_AIO_IO_SETUP_RETRY_SLEEP (500000UL)
+
+/** number of attempts before giving up on io_setup(). */
+#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS 5
+#endif
+
/** Array of events used in simulated aio */
static os_event_t* os_aio_segment_wait_events = NULL;
@@ -200,7 +276,7 @@ static ulint os_aio_n_segments = ULINT_UNDEFINED;
/** If the following is TRUE, read i/o handler threads try to
wait until a batch of new read requests have been posted */
static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
-#endif /* UNIV_HOTBACKUP */
+#endif /* !UNIV_HOTBACKUP */
UNIV_INTERN ulint os_n_file_reads = 0;
UNIV_INTERN ulint os_bytes_read_since_printout = 0;
@@ -406,17 +482,29 @@ os_file_get_last_error(
fflush(stderr);
- if (err == ENOSPC) {
+ switch (err) {
+ case ENOSPC:
return(OS_FILE_DISK_FULL);
- } else if (err == ENOENT) {
+ case ENOENT:
return(OS_FILE_NOT_FOUND);
- } else if (err == EEXIST) {
+ case EEXIST:
return(OS_FILE_ALREADY_EXISTS);
- } else if (err == EXDEV || err == ENOTDIR || err == EISDIR) {
+ case EXDEV:
+ case ENOTDIR:
+ case EISDIR:
return(OS_FILE_PATH_ERROR);
- } else {
- return(100 + err);
+ case EAGAIN:
+ if (srv_use_native_aio) {
+ return(OS_FILE_AIO_RESOURCES_RESERVED);
+ }
+ break;
+ case EINTR:
+ if (srv_use_native_aio) {
+ return(OS_FILE_AIO_INTERRUPTED);
+ }
+ break;
}
+ return(100 + err);
#endif
}
@@ -466,6 +554,9 @@ os_file_handle_error_cond_exit(
} else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
return(TRUE);
+ } else if (err == OS_FILE_AIO_INTERRUPTED) {
+
+ return(TRUE);
} else if (err == OS_FILE_ALREADY_EXISTS
|| err == OS_FILE_PATH_ERROR) {
@@ -806,7 +897,15 @@ next_file:
#ifdef HAVE_READDIR_R
ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent);
- if (ret != 0) {
+ if (ret != 0
+#ifdef UNIV_AIX
+ /* On AIX, only if we got non-NULL 'ent' (result) value and
+ a non-zero 'ret' (return) value, it indicates a failed
+ readdir_r() call. An NULL 'ent' with an non-zero 'ret'
+ would indicate the "end of the directory" is reached. */
+ && ent != NULL
+#endif
+ ) {
fprintf(stderr,
"InnoDB: cannot read directory %s, error %lu\n",
dirname, (ulong)ret);
@@ -933,13 +1032,15 @@ os_file_create_directory(
}
/****************************************************************//**
+NOTE! Use the corresponding macro os_file_create_simple(), not directly
+this function!
A simple function to open or create a file.
@return own: handle to the file, not defined if error, error number
can be retrieved with os_file_get_last_error */
UNIV_INTERN
os_file_t
-os_file_create_simple(
-/*==================*/
+os_file_create_simple_func(
+/*=======================*/
const char* name, /*!< in: name of the file or path as a
null-terminated string */
ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file is
@@ -1074,13 +1175,15 @@ try_again:
}
/****************************************************************//**
+NOTE! Use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
A simple function to open or create a file.
@return own: handle to the file, not defined if error, error number
can be retrieved with os_file_get_last_error */
UNIV_INTERN
os_file_t
-os_file_create_simple_no_error_handling(
-/*====================================*/
+os_file_create_simple_no_error_handling_func(
+/*=========================================*/
const char* name, /*!< in: name of the file or path as a
null-terminated string */
ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
@@ -1229,13 +1332,15 @@ os_file_set_nocache(
}
/****************************************************************//**
+NOTE! Use the corresponding macro os_file_create(), not directly
+this function!
Opens an existing file or creates a new.
@return own: handle to the file, not defined if error, error number
can be retrieved with os_file_get_last_error */
UNIV_INTERN
os_file_t
-os_file_create(
-/*===========*/
+os_file_create_func(
+/*================*/
const char* name, /*!< in: name of the file or path as a
null-terminated string */
ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
@@ -1285,7 +1390,7 @@ try_again:
buffering of writes in the OS */
attributes = 0;
#ifdef WIN_ASYNC_IO
- if (os_aio_use_native_aio) {
+ if (srv_use_native_aio) {
attributes = attributes | FILE_FLAG_OVERLAPPED;
}
#endif
@@ -1620,13 +1725,14 @@ loop:
}
/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_rename(), not directly this function!
Renames a file (can also move it to another directory). It is safest that the
file is closed before calling this function.
@return TRUE if success */
UNIV_INTERN
ibool
-os_file_rename(
-/*===========*/
+os_file_rename_func(
+/*================*/
const char* oldpath,/*!< in: old file path as a null-terminated
string */
const char* newpath)/*!< in: new file path */
@@ -1659,13 +1765,14 @@ os_file_rename(
}
/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_close(), not directly this function!
Closes a file handle. In case of error, error number can be retrieved with
os_file_get_last_error.
@return TRUE if success */
UNIV_INTERN
ibool
-os_file_close(
-/*==========*/
+os_file_close_func(
+/*===============*/
os_file_t file) /*!< in, own: handle to a file */
{
#ifdef __WIN__
@@ -1961,12 +2068,13 @@ os_file_fsync(
#endif /* !__WIN__ */
/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_flush(), not directly this function!
Flushes the write buffers of a given file to the disk.
@return TRUE if success */
UNIV_INTERN
ibool
-os_file_flush(
-/*==========*/
+os_file_flush_func(
+/*===============*/
os_file_t file) /*!< in, own: handle to a file */
{
#ifdef __WIN__
@@ -2273,12 +2381,14 @@ func_exit:
#endif
/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_read(), not directly this
+function!
Requests a synchronous positioned read operation.
@return TRUE if request was successful, FALSE if fail */
UNIV_INTERN
ibool
-os_file_read(
-/*=========*/
+os_file_read_func(
+/*==============*/
os_file_t file, /*!< in: handle to a file */
void* buf, /*!< in: buffer where to read */
ulint offset, /*!< in: least significant 32 bits of file
@@ -2396,13 +2506,15 @@ error_handling:
}
/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_read_no_error_handling(),
+not directly this function!
Requests a synchronous positioned read operation. This function does not do
any error handling. In case of error it returns FALSE.
@return TRUE if request was successful, FALSE if fail */
UNIV_INTERN
ibool
-os_file_read_no_error_handling(
-/*===========================*/
+os_file_read_no_error_handling_func(
+/*================================*/
os_file_t file, /*!< in: handle to a file */
void* buf, /*!< in: buffer where to read */
ulint offset, /*!< in: least significant 32 bits of file
@@ -2524,12 +2636,14 @@ os_file_read_string(
}
/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_write(), not directly
+this function!
Requests a synchronous write operation.
@return TRUE if request was successful, FALSE if fail */
UNIV_INTERN
ibool
-os_file_write(
-/*==========*/
+os_file_write_func(
+/*===============*/
const char* name, /*!< in: name of the file or path as a
null-terminated string */
os_file_t file, /*!< in: handle to a file */
@@ -2988,15 +3102,106 @@ os_aio_array_get_nth_slot(
return((array->slots) + index);
}
-/************************************************************************//**
-Creates an aio wait array.
-@return own: aio array */
+#if defined(LINUX_NATIVE_AIO)
+/******************************************************************//**
+Creates an io_context for native linux AIO.
+@return TRUE on success. */
+static
+ibool
+os_aio_linux_create_io_ctx(
+/*=======================*/
+ ulint max_events, /*!< in: number of events. */
+ io_context_t* io_ctx) /*!< out: io_ctx to initialize. */
+{
+ int ret;
+ ulint retries = 0;
+
+retry:
+ memset(io_ctx, 0x0, sizeof(*io_ctx));
+
+ /* Initialize the io_ctx. Tell it how many pending
+ IO requests this context will handle. */
+
+ ret = io_setup(max_events, io_ctx);
+ if (ret == 0) {
+#if defined(UNIV_AIO_DEBUG)
+ fprintf(stderr,
+ "InnoDB: Linux native AIO:"
+ " initialized io_ctx for segment\n");
+#endif
+ /* Success. Return now. */
+ return(TRUE);
+ }
+
+ /* If we hit EAGAIN we'll make a few attempts before failing. */
+
+ switch (ret) {
+ case -EAGAIN:
+ if (retries == 0) {
+ /* First time around. */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: io_setup() failed"
+ " with EAGAIN. Will make %d attempts"
+ " before giving up.\n",
+ OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
+ }
+
+ if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
+ ++retries;
+ fprintf(stderr,
+ "InnoDB: Warning: io_setup() attempt"
+ " %lu failed.\n",
+ retries);
+ os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
+ goto retry;
+ }
+
+ /* Have tried enough. Better call it a day. */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: io_setup() failed"
+ " with EAGAIN after %d attempts.\n",
+ OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
+ break;
+
+ case -ENOSYS:
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: Linux Native AIO interface"
+ " is not supported on this platform. Please"
+ " check your OS documentation and install"
+ " appropriate binary of InnoDB.\n");
+
+ break;
+
+ default:
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: Linux Native AIO setup"
+ " returned following error[%d]\n", -ret);
+ break;
+ }
+
+ fprintf(stderr,
+ "InnoDB: You can disable Linux Native AIO by"
+ " setting innodb_native_aio = off in my.cnf\n");
+ return(FALSE);
+}
+#endif /* LINUX_NATIVE_AIO */
+
+/******************************************************************//**
+Creates an aio wait array. Note that we return NULL in case of failure.
+We don't care about freeing memory here because we assume that a
+failure will result in server refusing to start up.
+@return own: aio array, NULL on failure */
static
os_aio_array_t*
os_aio_array_create(
/*================*/
- ulint n, /*!< in: maximum number of pending aio operations
- allowed; n must be divisible by n_segments */
+ ulint n, /*!< in: maximum number of pending aio
+ operations allowed; n must be
+ divisible by n_segments */
ulint n_segments) /*!< in: number of segments in the aio array */
{
os_aio_array_t* array;
@@ -3004,6 +3209,8 @@ os_aio_array_create(
os_aio_slot_t* slot;
#ifdef WIN_ASYNC_IO
OVERLAPPED* over;
+#elif defined(LINUX_NATIVE_AIO)
+ struct io_event* io_event = NULL;
#endif
ut_a(n > 0);
ut_a(n_segments > 0);
@@ -3019,10 +3226,44 @@ os_aio_array_create(
array->n_slots = n;
array->n_segments = n_segments;
array->n_reserved = 0;
+ array->cur_seg = 0;
array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
#ifdef __WIN__
array->native_events = ut_malloc(n * sizeof(os_native_event_t));
#endif
+
+#if defined(LINUX_NATIVE_AIO)
+ /* If we are not using native aio interface then skip this
+ part of initialization. */
+ if (!srv_use_native_aio) {
+ goto skip_native_aio;
+ }
+
+ /* Initialize the io_context array. One io_context
+ per segment in the array. */
+
+ array->aio_ctx = ut_malloc(n_segments *
+ sizeof(*array->aio_ctx));
+ for (i = 0; i < n_segments; ++i) {
+ if (!os_aio_linux_create_io_ctx(n/n_segments,
+ &array->aio_ctx[i])) {
+ /* If something bad happened during aio setup
+ we should call it a day and return right away.
+ We don't care about any leaks because a failure
+ to initialize the io subsystem means that the
+ server (or atleast the innodb storage engine)
+ is not going to startup. */
+ return(NULL);
+ }
+ }
+
+ /* Initialize the event array. One event per slot. */
+ io_event = ut_malloc(n * sizeof(*io_event));
+ memset(io_event, 0x0, sizeof(*io_event) * n);
+ array->aio_events = io_event;
+
+skip_native_aio:
+#endif /* LINUX_NATIVE_AIO */
for (i = 0; i < n; i++) {
slot = os_aio_array_get_nth_slot(array, i);
@@ -3036,6 +3277,12 @@ os_aio_array_create(
over->hEvent = slot->event->handle;
*((array->native_events) + i) = over->hEvent;
+
+#elif defined(LINUX_NATIVE_AIO)
+
+ memset(&slot->control, 0x0, sizeof(slot->control));
+ slot->n_bytes = 0;
+ slot->ret = 0;
#endif
}
@@ -3078,7 +3325,7 @@ respectively. The caller must create an i/o handler thread for each
segment in these arrays. This function also creates the sync array.
No i/o handler thread needs to be created for that */
UNIV_INTERN
-void
+ibool
os_aio_init(
/*========*/
ulint n_per_seg, /*<! in: maximum number of pending aio
@@ -3103,15 +3350,25 @@ os_aio_init(
/* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
+ if (os_aio_ibuf_array == NULL) {
+ goto err_exit;
+ }
srv_io_thread_function[0] = "insert buffer thread";
os_aio_log_array = os_aio_array_create(n_per_seg, 1);
+ if (os_aio_log_array == NULL) {
+ goto err_exit;
+ }
srv_io_thread_function[1] = "log thread";
os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
n_read_segs);
+ if (os_aio_read_array == NULL) {
+ goto err_exit;
+ }
+
for (i = 2; i < 2 + n_read_segs; i++) {
ut_a(i < SRV_MAX_N_IO_THREADS);
srv_io_thread_function[i] = "read thread";
@@ -3119,12 +3376,20 @@ os_aio_init(
os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
n_write_segs);
+ if (os_aio_write_array == NULL) {
+ goto err_exit;
+ }
+
for (i = 2 + n_read_segs; i < n_segments; i++) {
ut_a(i < SRV_MAX_N_IO_THREADS);
srv_io_thread_function[i] = "write thread";
}
os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
+ if (os_aio_sync_array == NULL) {
+ goto err_exit;
+ }
+
os_aio_n_segments = n_segments;
@@ -3138,6 +3403,11 @@ os_aio_init(
os_last_printout = time(NULL);
+ return(TRUE);
+
+err_exit:
+ return(FALSE);
+
}
/***********************************************************************
@@ -3204,6 +3474,19 @@ os_aio_wake_all_threads_at_shutdown(void)
os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
+
+#elif defined(LINUX_NATIVE_AIO)
+
+ /* When using native AIO interface the io helper threads
+ wait on io_getevents with a timeout value of 500ms. At
+ each wake up these threads check the server status.
+ No need to do anything to wake them up. */
+
+ if (srv_use_native_aio) {
+ return;
+ }
+ /* Fall through to simulated AIO handler wakeup if we are
+ not using native AIO. */
#endif
/* This loop wakes up all simulated ai/o threads */
@@ -3321,11 +3604,18 @@ os_aio_array_reserve_slot(
offset */
ulint len) /*!< in: length of the block to read or write */
{
- os_aio_slot_t* slot;
+ os_aio_slot_t* slot = NULL;
#ifdef WIN_ASYNC_IO
OVERLAPPED* control;
+
+#elif defined(LINUX_NATIVE_AIO)
+
+ struct iocb* iocb;
+ off_t aio_offset;
+
#endif
ulint i;
+ ulint counter;
ulint slots_per_seg;
ulint local_seg;
@@ -3344,7 +3634,7 @@ loop:
if (array->n_reserved == array->n_slots) {
os_mutex_exit(array->mutex);
- if (!os_aio_use_native_aio) {
+ if (!srv_use_native_aio) {
/* If the handler threads are suspended, wake them
so that we get more slots */
@@ -3356,17 +3646,13 @@ loop:
goto loop;
}
- /* First try to find a slot in the preferred local segment */
- for (i = local_seg * slots_per_seg; i < array->n_slots; i++) {
- slot = os_aio_array_get_nth_slot(array, i);
+ /* We start our search for an available slot from our preferred
+ local segment and do a full scan of the array. We are
+ guaranteed to find a slot in full scan. */
+ for (i = local_seg * slots_per_seg, counter = 0;
+ counter < array->n_slots; i++, counter++) {
- if (slot->reserved == FALSE) {
- goto found;
- }
- }
-
- /* Fall back to a full scan. We are guaranteed to find a slot */
- for (i = 0;; i++) {
+ i %= array->n_slots;
slot = os_aio_array_get_nth_slot(array, i);
if (slot->reserved == FALSE) {
@@ -3374,6 +3660,9 @@ loop:
}
}
+ /* We MUST always be able to get hold of a reserved slot. */
+ ut_error;
+
found:
ut_a(slot->reserved == FALSE);
array->n_reserved++;
@@ -3404,8 +3693,42 @@ found:
control->Offset = (DWORD)offset;
control->OffsetHigh = (DWORD)offset_high;
os_event_reset(slot->event);
-#endif
+#elif defined(LINUX_NATIVE_AIO)
+
+ /* If we are not using native AIO skip this part. */
+ if (!srv_use_native_aio) {
+ goto skip_native_aio;
+ }
+
+ /* Check if we are dealing with 64 bit arch.
+ If not then make sure that offset fits in 32 bits. */
+ if (sizeof(aio_offset) == 8) {
+ aio_offset = offset_high;
+ aio_offset <<= 32;
+ aio_offset += offset;
+ } else {
+ ut_a(offset_high == 0);
+ aio_offset = offset;
+ }
+
+ iocb = &slot->control;
+
+ if (type == OS_FILE_READ) {
+ io_prep_pread(iocb, file, buf, len, aio_offset);
+ } else {
+ ut_a(type == OS_FILE_WRITE);
+ io_prep_pwrite(iocb, file, buf, len, aio_offset);
+ }
+
+ iocb->data = (void*)slot;
+ slot->n_bytes = 0;
+ slot->ret = 0;
+ /*fprintf(stderr, "Filled up Linux native iocb.\n");*/
+
+
+skip_native_aio:
+#endif /* LINUX_NATIVE_AIO */
os_mutex_exit(array->mutex);
return(slot);
@@ -3440,7 +3763,23 @@ os_aio_array_free_slot(
}
#ifdef WIN_ASYNC_IO
+
os_event_reset(slot->event);
+
+#elif defined(LINUX_NATIVE_AIO)
+
+ if (srv_use_native_aio) {
+ memset(&slot->control, 0x0, sizeof(slot->control));
+ slot->n_bytes = 0;
+ slot->ret = 0;
+ /*fprintf(stderr, "Freed up Linux native slot.\n");*/
+ } else {
+ /* These fields should not be used if we are not
+ using native AIO. */
+ ut_ad(slot->n_bytes == 0);
+ ut_ad(slot->ret == 0);
+ }
+
#endif
os_mutex_exit(array->mutex);
}
@@ -3460,7 +3799,7 @@ os_aio_simulated_wake_handler_thread(
ulint n;
ulint i;
- ut_ad(!os_aio_use_native_aio);
+ ut_ad(!srv_use_native_aio);
segment = os_aio_get_array_and_local_segment(&array, global_segment);
@@ -3496,7 +3835,7 @@ os_aio_simulated_wake_handler_threads(void)
{
ulint i;
- if (os_aio_use_native_aio) {
+ if (srv_use_native_aio) {
/* We do not use simulated aio: do nothing */
return;
@@ -3528,7 +3867,7 @@ readahead requests. */
os_aio_array_t* array;
ulint g;
- if (os_aio_use_native_aio) {
+ if (srv_use_native_aio) {
/* We do not use simulated aio: do nothing */
return;
@@ -3547,13 +3886,62 @@ readahead requests. */
#endif /* __WIN__ */
}
+#if defined(LINUX_NATIVE_AIO)
/*******************************************************************//**
+Dispatch an AIO request to the kernel.
+@return TRUE on success. */
+static
+ibool
+os_aio_linux_dispatch(
+/*==================*/
+ os_aio_array_t* array, /*!< in: io request array. */
+ os_aio_slot_t* slot) /*!< in: an already reserved slot. */
+{
+ int ret;
+ ulint io_ctx_index;
+ struct iocb* iocb;
+
+ ut_ad(slot != NULL);
+ ut_ad(array);
+
+ ut_a(slot->reserved);
+
+ /* Find out what we are going to work with.
+ The iocb struct is directly in the slot.
+ The io_context is one per segment. */
+
+ iocb = &slot->control;
+ io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;
+
+ ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);
+
+#if defined(UNIV_AIO_DEBUG)
+ fprintf(stderr,
+ "io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
+ (slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
+ array->aio_ctx[io_ctx_index], (ulong)io_ctx_index);
+#endif
+
+ /* io_submit returns number of successfully
+ queued requests or -errno. */
+ if (UNIV_UNLIKELY(ret != 1)) {
+ errno = -ret;
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+#endif /* LINUX_NATIVE_AIO */
+
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an asynchronous i/o operation.
@return TRUE if request was queued successfully, FALSE if fail */
UNIV_INTERN
ibool
-os_aio(
-/*===*/
+os_aio_func(
+/*========*/
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
to OS_AIO_SIMULATED_WAKE_LATER: the
@@ -3596,8 +3984,7 @@ os_aio(
struct fil_node_struct * dummy_mess1;
void* dummy_mess2;
ulint dummy_type;
-#endif
- ulint err = 0;
+#endif /* WIN_ASYNC_IO */
ibool retry;
ulint wake_later;
@@ -3613,8 +4000,8 @@ os_aio(
if (mode == OS_AIO_SYNC
#ifdef WIN_ASYNC_IO
- && !os_aio_use_native_aio
-#endif
+ && !srv_use_native_aio
+#endif /* WIN_ASYNC_IO */
) {
/* This is actually an ordinary synchronous read or write:
no need to use an i/o-handler thread. NOTE that if we use
@@ -3653,6 +4040,11 @@ try_again:
array = os_aio_log_array;
} else if (mode == OS_AIO_SYNC) {
array = os_aio_sync_array;
+
+#if defined(LINUX_NATIVE_AIO)
+ /* In Linux native AIO we don't use sync IO array. */
+ ut_a(!srv_use_native_aio);
+#endif /* LINUX_NATIVE_AIO */
} else {
array = NULL; /* Eliminate compiler warning */
ut_error;
@@ -3661,13 +4053,17 @@ try_again:
slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
name, buf, offset, offset_high, n);
if (type == OS_FILE_READ) {
- if (os_aio_use_native_aio) {
-#ifdef WIN_ASYNC_IO
+ if (srv_use_native_aio) {
os_n_file_reads++;
- os_bytes_read_since_printout += len;
-
+ os_bytes_read_since_printout += n;
+#ifdef WIN_ASYNC_IO
ret = ReadFile(file, buf, (DWORD)n, &len,
&(slot->control));
+
+#elif defined(LINUX_NATIVE_AIO)
+ if (!os_aio_linux_dispatch(array, slot)) {
+ goto err_exit;
+ }
#endif
} else {
if (!wake_later) {
@@ -3677,11 +4073,16 @@ try_again:
}
}
} else if (type == OS_FILE_WRITE) {
- if (os_aio_use_native_aio) {
-#ifdef WIN_ASYNC_IO
+ if (srv_use_native_aio) {
os_n_file_writes++;
+#ifdef WIN_ASYNC_IO
ret = WriteFile(file, buf, (DWORD)n, &len,
&(slot->control));
+
+#elif defined(LINUX_NATIVE_AIO)
+ if (!os_aio_linux_dispatch(array, slot)) {
+ goto err_exit;
+ }
#endif
} else {
if (!wake_later) {
@@ -3695,7 +4096,7 @@ try_again:
}
#ifdef WIN_ASYNC_IO
- if (os_aio_use_native_aio) {
+ if (srv_use_native_aio) {
if ((ret && len == n)
|| (!ret && GetLastError() == ERROR_IO_PENDING)) {
/* aio was queued successfully! */
@@ -3718,15 +4119,15 @@ try_again:
return(TRUE);
}
- err = 1; /* Fall through the next if */
- }
-#endif
- if (err == 0) {
- /* aio was queued successfully! */
-
- return(TRUE);
+ goto err_exit;
}
+#endif /* WIN_ASYNC_IO */
+ /* aio was queued successfully! */
+ return(TRUE);
+#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
+err_exit:
+#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
os_aio_array_free_slot(array, slot);
retry = os_file_handle_error(name,
@@ -3830,7 +4231,9 @@ os_aio_windows_handle(
#ifdef UNIV_DO_FLUSH
if (slot->type == OS_FILE_WRITE
&& !os_do_not_call_flush_at_each_write) {
- ut_a(TRUE == os_file_flush(slot->file));
+ if (!os_file_flush(slot->file)) {
+ ut_error;
+ }
}
#endif /* UNIV_DO_FLUSH */
} else if (os_file_handle_error(slot->name, "Windows aio")) {
@@ -3847,6 +4250,18 @@ os_aio_windows_handle(
/* retry failed read/write operation synchronously.
No need to hold array->mutex. */
+#ifdef UNIV_PFS_IO
+ /* This read/write does not go through os_file_read
+ and os_file_write APIs, need to register with
+ performance schema explicitly here. */
+ struct PSI_file_locker* locker = NULL;
+ register_pfs_file_io_begin(locker, slot->file, slot->len,
+ (slot->type == OS_FILE_WRITE)
+ ? PSI_FILE_WRITE
+ : PSI_FILE_READ,
+ __FILE__, __LINE__);
+#endif
+
switch (slot->type) {
case OS_FILE_WRITE:
ret = WriteFile(slot->file, slot->buf,
@@ -3864,6 +4279,10 @@ os_aio_windows_handle(
ut_error;
}
+#ifdef UNIV_PFS_IO
+ register_pfs_file_io_end(locker, len);
+#endif
+
if (!ret && GetLastError() == ERROR_IO_PENDING) {
/* aio was queued successfully!
We want a synchronous i/o operation on a
@@ -3885,6 +4304,256 @@ os_aio_windows_handle(
}
#endif
+#if defined(LINUX_NATIVE_AIO)
+/******************************************************************//**
+This function is only used in Linux native asynchronous i/o. This is
+called from within the io-thread. If there are no completed IO requests
+in the slot array, the thread calls this function to collect more
+requests from the kernel.
+The io-thread waits on io_getevents(), which is a blocking call, with
+a timeout value. Unless the system is very heavy loaded, keeping the
+io-thread very busy, the io-thread will spend most of its time waiting
+in this function.
+The io-thread also exits in this function. It checks server status at
+each wakeup and that is why we use timed wait in io_getevents(). */
+static
+void
+os_aio_linux_collect(
+/*=================*/
+ os_aio_array_t* array, /*!< in/out: slot array. */
+ ulint segment, /*!< in: local segment no. */
+ ulint seg_size) /*!< in: segment size. */
+{
+ int i;
+ int ret;
+ ulint start_pos;
+ ulint end_pos;
+ struct timespec timeout;
+ struct io_event* events;
+ struct io_context* io_ctx;
+
+ /* sanity checks. */
+ ut_ad(array != NULL);
+ ut_ad(seg_size > 0);
+ ut_ad(segment < array->n_segments);
+
+ /* Which part of event array we are going to work on. */
+ events = &array->aio_events[segment * seg_size];
+
+ /* Which io_context we are going to use. */
+ io_ctx = array->aio_ctx[segment];
+
+ /* Starting point of the segment we will be working on. */
+ start_pos = segment * seg_size;
+
+ /* End point. */
+ end_pos = start_pos + seg_size;
+
+retry:
+
+ /* Go down if we are in shutdown mode.
+ In case of srv_fast_shutdown == 2, there may be pending
+ IO requests but that should be OK as we essentially treat
+ that as a crash of InnoDB. */
+ if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+ os_thread_exit(NULL);
+ }
+
+ /* Initialize the events. The timeout value is arbitrary.
+ We probably need to experiment with it a little. */
+ memset(events, 0, sizeof(*events) * seg_size);
+ timeout.tv_sec = 0;
+ timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
+
+ ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
+
+ /* This error handling is for any error in collecting the
+ IO requests. The errors, if any, for any particular IO
+ request are simply passed on to the calling routine. */
+
+ /* Not enough resources! Try again. */
+ if (ret == -EAGAIN) {
+ goto retry;
+ }
+
+ /* Interrupted! I have tested the behaviour in case of an
+ interrupt. If we have some completed IOs available then
+ the return code will be the number of IOs. We get EINTR only
+ if there are no completed IOs and we have been interrupted. */
+ if (ret == -EINTR) {
+ goto retry;
+ }
+
+ /* No pending request! Go back and check again. */
+ if (ret == 0) {
+ goto retry;
+ }
+
+ /* All other errors! should cause a trap for now. */
+ if (UNIV_UNLIKELY(ret < 0)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: unexpected ret_code[%d] from"
+ " io_getevents()!\n", ret);
+ ut_error;
+ }
+
+ ut_a(ret > 0);
+
+ for (i = 0; i < ret; i++) {
+ os_aio_slot_t* slot;
+ struct iocb* control;
+
+ control = (struct iocb *)events[i].obj;
+ ut_a(control != NULL);
+
+ slot = (os_aio_slot_t *) control->data;
+
+ /* Some sanity checks. */
+ ut_a(slot != NULL);
+ ut_a(slot->reserved);
+
+#if defined(UNIV_AIO_DEBUG)
+ fprintf(stderr,
+ "io_getevents[%c]: slot[%p] ctx[%p]"
+ " seg[%lu]\n",
+ (slot->type == OS_FILE_WRITE) ? 'w' : 'r',
+ slot, io_ctx, segment);
+#endif
+
+ /* We are not scribbling previous segment. */
+ ut_a(slot->pos >= start_pos);
+
+ /* We have not overstepped to next segment. */
+ ut_a(slot->pos < end_pos);
+
+ /* Mark this request as completed. The error handling
+ will be done in the calling function. */
+ os_mutex_enter(array->mutex);
+ slot->n_bytes = events[i].res;
+ slot->ret = events[i].res2;
+ slot->io_already_done = TRUE;
+ os_mutex_exit(array->mutex);
+ }
+
+ return;
+}
+
+/**********************************************************************//**
+This function is only used in Linux native asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait for
+the completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
+@return TRUE if the IO was successful */
+UNIV_INTERN
+ibool
+os_aio_linux_handle(
+/*================*/
+ ulint global_seg, /*!< in: segment number in the aio array
+ to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 is log i/o thread,
+ then follow the non-ibuf read threads,
+ and the last are the non-ibuf write
+ threads. */
+ fil_node_t**message1, /*!< out: the messages passed with the */
+ void** message2, /*!< aio request; note that in case the
+ aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation. */
+ ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
+{
+ ulint segment;
+ os_aio_array_t* array;
+ os_aio_slot_t* slot;
+ ulint n;
+ ulint i;
+ ibool ret = FALSE;
+
+ /* Should never be doing Sync IO here. */
+ ut_a(global_seg != ULINT_UNDEFINED);
+
+ /* Find the array and the local segment. */
+ segment = os_aio_get_array_and_local_segment(&array, global_seg);
+ n = array->n_slots / array->n_segments;
+
+ /* Loop until we have found a completed request. */
+ for (;;) {
+ os_mutex_enter(array->mutex);
+ for (i = 0; i < n; ++i) {
+ slot = os_aio_array_get_nth_slot(
+ array, i + segment * n);
+ if (slot->reserved && slot->io_already_done) {
+ /* Something for us to work on. */
+ goto found;
+ }
+ }
+
+ os_mutex_exit(array->mutex);
+
+ /* We don't have any completed request.
+ Wait for some request. Note that we return
+ from wait iff we have found a request. */
+
+ srv_set_io_thread_op_info(global_seg,
+ "waiting for completed aio requests");
+ os_aio_linux_collect(array, segment, n);
+ }
+
+found:
+ /* Note that it may be that there are more then one completed
+ IO requests. We process them one at a time. We may have a case
+ here to improve the performance slightly by dealing with all
+ requests in one sweep. */
+ srv_set_io_thread_op_info(global_seg,
+ "processing completed aio requests");
+
+ /* Ensure that we are scribbling only our segment. */
+ ut_a(i < n);
+
+ ut_ad(slot != NULL);
+ ut_ad(slot->reserved);
+ ut_ad(slot->io_already_done);
+
+ *message1 = slot->message1;
+ *message2 = slot->message2;
+
+ *type = slot->type;
+
+ if ((slot->ret == 0) && (slot->n_bytes == (long)slot->len)) {
+ ret = TRUE;
+
+#ifdef UNIV_DO_FLUSH
+ if (slot->type == OS_FILE_WRITE
+ && !os_do_not_call_flush_at_each_write)
+ && !os_file_flush(slot->file) {
+ ut_error;
+ }
+#endif /* UNIV_DO_FLUSH */
+ } else {
+ errno = -slot->ret;
+
+ /* os_file_handle_error does tell us if we should retry
+ this IO. As it stands now, we don't do this retry when
+ reaping requests from a different context than
+ the dispatcher. This non-retry logic is the same for
+ windows and linux native AIO.
+ We should probably look into this to transparently
+ re-submit the IO. */
+ os_file_handle_error(slot->name, "Linux aio");
+
+ ret = FALSE;
+ }
+
+ os_mutex_exit(array->mutex);
+
+ os_aio_array_free_slot(array, slot);
+
+ return(ret);
+}
+#endif /* LINUX_NATIVE_AIO */
+
/**********************************************************************//**
Does simulated aio. This function should be called by an i/o-handler
thread.
@@ -3923,6 +4592,9 @@ os_aio_simulated_handle(
ulint n;
ulint i;
+ /* Fix compiler warning */
+ *consecutive_ios = NULL;
+
segment = os_aio_get_array_and_local_segment(&array, global_segment);
restart:
@@ -4260,6 +4932,40 @@ os_aio_validate(void)
}
/**********************************************************************//**
+Prints pending IO requests per segment of an aio array.
+We probably don't need per segment statistics but they can help us
+during development phase to see if the IO requests are being
+distributed as expected. */
+static
+void
+os_aio_print_segment_info(
+/*======================*/
+ FILE* file, /*!< in: file where to print */
+ ulint* n_seg, /*!< in: pending IO array */
+ os_aio_array_t* array) /*!< in: array to process */
+{
+ ulint i;
+
+ ut_ad(array);
+ ut_ad(n_seg);
+ ut_ad(array->n_segments > 0);
+
+ if (array->n_segments == 1) {
+ return;
+ }
+
+ fprintf(file, " [");
+ for (i = 0; i < array->n_segments; i++) {
+ if (i != 0) {
+ fprintf(file, ", ");
+ }
+
+ fprintf(file, "%lu", n_seg[i]);
+ }
+ fprintf(file, "] ");
+}
+
+/**********************************************************************//**
Prints info of the aio arrays. */
UNIV_INTERN
void
@@ -4270,6 +4976,7 @@ os_aio_print(
os_aio_array_t* array;
os_aio_slot_t* slot;
ulint n_reserved;
+ ulint n_res_seg[SRV_MAX_N_IO_THREADS];
time_t current_time;
double time_elapsed;
double avg_bytes_read;
@@ -4302,11 +5009,17 @@ loop:
n_reserved = 0;
+ memset(n_res_seg, 0x0, sizeof(n_res_seg));
+
for (i = 0; i < array->n_slots; i++) {
+ ulint seg_no;
+
slot = os_aio_array_get_nth_slot(array, i);
+ seg_no = (i * array->n_segments) / array->n_slots;
if (slot->reserved) {
n_reserved++;
+ n_res_seg[seg_no]++;
#if 0
fprintf(stderr, "Reserved slot, messages %p %p\n",
(void*) slot->message1,
@@ -4320,6 +5033,8 @@ loop:
fprintf(file, " %lu", (ulong) n_reserved);
+ os_aio_print_segment_info(file, n_res_seg, array);
+
os_mutex_exit(array->mutex);
if (array == os_aio_read_array) {
diff --git a/storage/innobase/os/os0thread.c b/storage/innobase/os/os0thread.c
index ac733373646..78df66d7834 100644
--- a/storage/innobase/os/os0thread.c
+++ b/storage/innobase/os/os0thread.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -212,6 +212,11 @@ os_thread_exit(
fprintf(stderr, "Thread exits, id %lu\n",
os_thread_pf(os_thread_get_curr_id()));
#endif
+
+#ifdef UNIV_PFS_THREAD
+ pfs_delete_thread();
+#endif
+
os_mutex_enter(os_sync_mutex);
os_thread_count--;
os_mutex_exit(os_sync_mutex);
diff --git a/storage/innobase/page/page0page.c b/storage/innobase/page/page0page.c
index ab2ba60570e..10008f9ac25 100644
--- a/storage/innobase/page/page0page.c
+++ b/storage/innobase/page/page0page.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -658,6 +658,14 @@ page_copy_rec_list_end(
index, mtr);
}
+ /* Update PAGE_MAX_TRX_ID on the uncompressed page.
+ Modifications will be redo logged and copied to the compressed
+ page in page_zip_compress() or page_zip_reorganize() below. */
+ if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
+ page_update_max_trx_id(new_block, NULL,
+ page_get_max_trx_id(page), mtr);
+ }
+
if (UNIV_LIKELY_NULL(new_page_zip)) {
mtr_set_log_mode(mtr, log_mode);
@@ -696,15 +704,10 @@ page_copy_rec_list_end(
}
}
- /* Update the lock table, MAX_TRX_ID, and possible hash index */
+ /* Update the lock table and possible hash index */
lock_move_rec_list_end(new_block, block, rec);
- if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
- page_update_max_trx_id(new_block, new_page_zip,
- page_get_max_trx_id(page), mtr);
- }
-
btr_search_move_or_delete_hash_entries(new_block, block, index);
return(ret);
@@ -772,6 +775,16 @@ page_copy_rec_list_start(
mem_heap_free(heap);
}
+ /* Update PAGE_MAX_TRX_ID on the uncompressed page.
+ Modifications will be redo logged and copied to the compressed
+ page in page_zip_compress() or page_zip_reorganize() below. */
+ if (dict_index_is_sec_or_ibuf(index)
+ && page_is_leaf(page_align(rec))) {
+ page_update_max_trx_id(new_block, NULL,
+ page_get_max_trx_id(page_align(rec)),
+ mtr);
+ }
+
if (UNIV_LIKELY_NULL(new_page_zip)) {
mtr_set_log_mode(mtr, log_mode);
@@ -809,14 +822,7 @@ page_copy_rec_list_start(
}
}
- /* Update MAX_TRX_ID, the lock table, and possible hash index */
-
- if (dict_index_is_sec_or_ibuf(index)
- && page_is_leaf(page_align(rec))) {
- page_update_max_trx_id(new_block, new_page_zip,
- page_get_max_trx_id(page_align(rec)),
- mtr);
- }
+ /* Update the lock table and possible hash index */
lock_move_rec_list_start(new_block, block, rec, ret);
@@ -2408,8 +2414,13 @@ page_validate(
}
offs = page_offset(rec_get_start(rec, offsets));
+ i = rec_offs_size(offsets);
+ if (UNIV_UNLIKELY(offs + i >= UNIV_PAGE_SIZE)) {
+ fputs("InnoDB: record offset out of bounds\n", stderr);
+ goto func_exit;
+ }
- for (i = rec_offs_size(offsets); i--; ) {
+ while (i--) {
if (UNIV_UNLIKELY(buf[offs + i])) {
/* No other record may overlap this */
@@ -2517,8 +2528,13 @@ n_owned_zero:
count++;
offs = page_offset(rec_get_start(rec, offsets));
+ i = rec_offs_size(offsets);
+ if (UNIV_UNLIKELY(offs + i >= UNIV_PAGE_SIZE)) {
+ fputs("InnoDB: record offset out of bounds\n", stderr);
+ goto func_exit;
+ }
- for (i = rec_offs_size(offsets); i--; ) {
+ while (i--) {
if (UNIV_UNLIKELY(buf[offs + i])) {
fputs("InnoDB: Record overlaps another"
diff --git a/storage/innobase/pars/pars0pars.c b/storage/innobase/pars/pars0pars.c
index 9faf36d00a8..613e7962f0e 100644
--- a/storage/innobase/pars/pars0pars.c
+++ b/storage/innobase/pars/pars0pars.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -2031,6 +2031,29 @@ pars_info_add_int4_literal(
Equivalent to:
char buf[8];
+mach_write_ull(buf, val);
+pars_info_add_literal(info, name, buf, 8, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+UNIV_INTERN
+void
+pars_info_add_uint64_literal(
+/*=========================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ ib_uint64_t val) /*!< in: value */
+{
+ byte* buf = mem_heap_alloc(info->heap, 8);
+
+ mach_write_ull(buf, val);
+ pars_info_add_literal(info, name, buf, 8, DATA_INT, 0);
+}
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[8];
mach_write_to_8(buf, val);
pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0);
diff --git a/storage/innobase/plug.in b/storage/innobase/plug.in
index 09a95ecc157..4ca1b520526 100644
--- a/storage/innobase/plug.in
+++ b/storage/innobase/plug.in
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+# Copyright (c) 2006, 2010, Innobase Oy. All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
@@ -14,7 +14,7 @@
# Place, Suite 330, Boston, MA 02111-1307 USA
#
-MYSQL_STORAGE_ENGINE(innobase, innodb, [InnoDB Storage Engine],
+MYSQL_STORAGE_ENGINE(innobase, innodb, [InnoDB Storage Engine],
[Transactional Tables using InnoDB], [max,max-no-ndb])
MYSQL_PLUGIN_DIRECTORY(innobase, [storage/innobase])
MYSQL_PLUGIN_STATIC(innobase, [libinnobase.a])
@@ -28,6 +28,14 @@ MYSQL_PLUGIN_ACTIONS(innobase, [
AC_C_BIGENDIAN
case "$target_os" in
lin*)
+ AC_CHECK_HEADER(libaio.h,
+ AC_CHECK_LIB(aio, io_setup,
+ LIBS="$LIBS -laio"
+ AC_DEFINE(LINUX_NATIVE_AIO, [1],
+ [Linux native async I/O support]),
+ AC_MSG_WARN([No Linux native async I/O])),
+ AC_MSG_WARN([No Linux native async I/O]))
+
CFLAGS="$CFLAGS -DUNIV_LINUX";;
hpux10*)
CFLAGS="$CFLAGS -DUNIV_MUST_NOT_INLINE -DUNIV_HPUX -DUNIV_HPUX10";;
diff --git a/storage/innobase/rem/rem0rec.c b/storage/innobase/rem/rem0rec.c
index 1c8b3fd8c1e..27c11dacc8c 100644
--- a/storage/innobase/rem/rem0rec.c
+++ b/storage/innobase/rem/rem0rec.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -695,19 +695,9 @@ rec_get_nth_field_offs_old(
ulint os;
ulint next_os;
- ut_ad(rec && len);
- ut_ad(n < rec_get_n_fields_old(rec));
-
- if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS)) {
- fprintf(stderr, "Error: trying to access field %lu in rec\n",
- (ulong) n);
- ut_error;
- }
-
- if (UNIV_UNLIKELY(rec == NULL)) {
- fputs("Error: rec is NULL pointer\n", stderr);
- ut_error;
- }
+ ut_ad(len);
+ ut_a(rec);
+ ut_a(n < rec_get_n_fields_old(rec));
if (rec_get_1byte_offs_flag(rec)) {
os = rec_1_get_field_start_offs(rec, n);
diff --git a/storage/innobase/row/row0ins.c b/storage/innobase/row/row0ins.c
index fe51fce82c4..906aaae2412 100644
--- a/storage/innobase/row/row0ins.c
+++ b/storage/innobase/row/row0ins.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -1965,7 +1965,7 @@ row_ins_index_entry_low(
que_thr_t* thr) /*!< in: query thread */
{
btr_cur_t cursor;
- ulint ignore_sec_unique = 0;
+ ulint search_mode;
ulint modify = 0; /* remove warning */
rec_t* insert_rec;
rec_t* rec;
@@ -1985,18 +1985,23 @@ row_ins_index_entry_low(
the function will return in both low_match and up_match of the
cursor sensible values */
- if (!(thr_get_trx(thr)->check_unique_secondary)) {
- ignore_sec_unique = BTR_IGNORE_SEC_UNIQUE;
+ if (dict_index_is_clust(index)) {
+ search_mode = mode;
+ } else if (!(thr_get_trx(thr)->check_unique_secondary)) {
+ search_mode = mode | BTR_INSERT | BTR_IGNORE_SEC_UNIQUE;
+ } else {
+ search_mode = mode | BTR_INSERT;
}
btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
- mode | BTR_INSERT | ignore_sec_unique,
- &cursor, 0, &mtr);
+ search_mode,
+ &cursor, 0, __FILE__, __LINE__, &mtr);
if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) {
/* The insertion was made to the insert buffer already during
the search: we are done */
+ ut_ad(search_mode & BTR_INSERT);
err = DB_SUCCESS;
goto function_exit;
@@ -2049,7 +2054,8 @@ row_ins_index_entry_low(
btr_cur_search_to_nth_level(index, 0, entry,
PAGE_CUR_LE,
mode | BTR_INSERT,
- &cursor, 0, &mtr);
+ &cursor, 0,
+ __FILE__, __LINE__, &mtr);
}
}
@@ -2104,7 +2110,8 @@ function_exit:
mtr_start(&mtr);
btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
- BTR_MODIFY_TREE, &cursor, 0, &mtr);
+ BTR_MODIFY_TREE, &cursor, 0,
+ __FILE__, __LINE__, &mtr);
rec = btr_cur_get_rec(&cursor);
offsets = rec_get_offsets(rec, index, NULL,
ULINT_UNDEFINED, &heap);
diff --git a/storage/innobase/row/row0merge.c b/storage/innobase/row/row0merge.c
index 232211e5ce7..908d142c98f 100644
--- a/storage/innobase/row/row0merge.c
+++ b/storage/innobase/row/row0merge.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -57,6 +57,11 @@ Completed by Sunny Bains and Marko Makela
#include "ut0sort.h"
#include "handler0alter.h"
+/* Ignore posix_fadvise() on those platforms where it does not exist */
+#if defined __WIN__
+# define posix_fadvise(fd, offset, len, advice) /* nothing */
+#endif /* __WIN__ */
+
#ifdef UNIV_DEBUG
/** Set these in order ot enable debug printout. */
/* @{ */
@@ -424,14 +429,13 @@ row_merge_dup_report(
row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
const dfield_t* entry) /*!< in: duplicate index entry */
{
- mrec_buf_t buf;
+ mrec_buf_t* buf;
const dtuple_t* tuple;
dtuple_t tuple_store;
const rec_t* rec;
const dict_index_t* index = dup->index;
ulint n_fields= dict_index_get_n_fields(index);
- mem_heap_t* heap = NULL;
- ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ mem_heap_t* heap;
ulint* offsets;
ulint n_ext;
@@ -441,22 +445,22 @@ row_merge_dup_report(
return;
}
- rec_offs_init(offsets_);
-
/* Convert the tuple to a record and then to MySQL format. */
+ heap = mem_heap_create((1 + REC_OFFS_HEADER_SIZE + n_fields)
+ * sizeof *offsets
+ + sizeof *buf);
+
+ buf = mem_heap_alloc(heap, sizeof *buf);
tuple = dtuple_from_fields(&tuple_store, entry, n_fields);
n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0;
- rec = rec_convert_dtuple_to_rec(buf, index, tuple, n_ext);
- offsets = rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED,
- &heap);
+ rec = rec_convert_dtuple_to_rec(*buf, index, tuple, n_ext);
+ offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
innobase_rec_to_mysql(dup->table, rec, index, offsets);
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
- }
+ mem_heap_free(heap);
}
/*************************************************************//**
@@ -627,22 +631,26 @@ row_merge_buf_write(
}
/******************************************************//**
-Create a memory heap and allocate space for row_merge_rec_offsets().
+Create a memory heap and allocate space for row_merge_rec_offsets()
+and mrec_buf_t[3].
@return memory heap */
static
mem_heap_t*
row_merge_heap_create(
/*==================*/
const dict_index_t* index, /*!< in: record descriptor */
+ mrec_buf_t** buf, /*!< out: 3 buffers */
ulint** offsets1, /*!< out: offsets */
ulint** offsets2) /*!< out: offsets */
{
ulint i = 1 + REC_OFFS_HEADER_SIZE
+ dict_index_get_n_fields(index);
- mem_heap_t* heap = mem_heap_create(2 * i * sizeof *offsets1);
+ mem_heap_t* heap = mem_heap_create(2 * i * sizeof **offsets1
+ + 3 * sizeof **buf);
- *offsets1 = mem_heap_alloc(heap, i * sizeof *offsets1);
- *offsets2 = mem_heap_alloc(heap, i * sizeof *offsets2);
+ *buf = mem_heap_alloc(heap, 3 * sizeof **buf);
+ *offsets1 = mem_heap_alloc(heap, i * sizeof **offsets1);
+ *offsets2 = mem_heap_alloc(heap, i * sizeof **offsets2);
(*offsets1)[0] = (*offsets2)[0] = i;
(*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
@@ -704,6 +712,11 @@ row_merge_read(
(ulint) (ofs & 0xFFFFFFFF),
(ulint) (ofs >> 32),
sizeof *buf);
+#ifdef POSIX_FADV_DONTNEED
+ /* Each block is read exactly once. Free up the file cache. */
+ posix_fadvise(fd, ofs, sizeof *buf, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
if (UNIV_UNLIKELY(!success)) {
ut_print_timestamp(stderr);
fprintf(stderr,
@@ -734,6 +747,12 @@ row_merge_write(
}
#endif /* UNIV_DEBUG */
+#ifdef POSIX_FADV_DONTNEED
+ /* The block will be needed on the next merge pass,
+ but it can be evicted from the file cache meanwhile. */
+ posix_fadvise(fd, ofs, sizeof *buf, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
return(UNIV_LIKELY(os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
(ulint) (ofs & 0xFFFFFFFF),
(ulint) (ofs >> 32),
@@ -1394,7 +1413,8 @@ row_merge_blocks(
{
mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
- mrec_buf_t buf[3]; /*!< buffer for handling split mrec in block[] */
+ mrec_buf_t* buf; /*!< buffer for handling
+ split mrec in block[] */
const byte* b0; /*!< pointer to block[0] */
const byte* b1; /*!< pointer to block[1] */
byte* b2; /*!< pointer to block[2] */
@@ -1414,7 +1434,7 @@ row_merge_blocks(
}
#endif /* UNIV_DEBUG */
- heap = row_merge_heap_create(index, &offsets0, &offsets1);
+ heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
/* Write a record and read the next record. Split the output
file in two halves, which can be merged on the following pass. */
@@ -1500,7 +1520,7 @@ row_merge_blocks_copy(
{
mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
- mrec_buf_t buf[3]; /*!< buffer for handling
+ mrec_buf_t* buf; /*!< buffer for handling
split mrec in block[] */
const byte* b0; /*!< pointer to block[0] */
byte* b2; /*!< pointer to block[2] */
@@ -1518,7 +1538,7 @@ row_merge_blocks_copy(
}
#endif /* UNIV_DEBUG */
- heap = row_merge_heap_create(index, &offsets0, &offsets1);
+ heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
/* Write a record and read the next record. Split the output
file in two halves, which can be merged on the following pass. */
@@ -1589,6 +1609,14 @@ row_merge(
of.offset = 0;
of.n_rec = 0;
+#ifdef POSIX_FADV_SEQUENTIAL
+ /* The input file will be read sequentially, starting from the
+ beginning and the middle. In Linux, the POSIX_FADV_SEQUENTIAL
+ affects the entire file. Each block will be read exactly once. */
+ posix_fadvise(file->fd, 0, 0,
+ POSIX_FADV_SEQUENTIAL | POSIX_FADV_NOREUSE);
+#endif /* POSIX_FADV_SEQUENTIAL */
+
/* Merge blocks to the output file. */
ohalf = 0;
foffs0 = 0;
@@ -1760,7 +1788,6 @@ row_merge_insert_index_tuples(
int fd, /*!< in: file descriptor */
row_merge_block_t* block) /*!< in/out: file buffer */
{
- mrec_buf_t buf;
const byte* b;
que_thr_t* thr;
ins_node_t* node;
@@ -1779,7 +1806,7 @@ row_merge_insert_index_tuples(
trx->op_info = "inserting index entries";
- graph_heap = mem_heap_create(500);
+ graph_heap = mem_heap_create(500 + sizeof(mrec_buf_t));
node = ins_node_create(INS_DIRECT, table, graph_heap);
thr = pars_complete_graph_for_exec(node, trx, graph_heap);
@@ -1801,12 +1828,14 @@ row_merge_insert_index_tuples(
if (!row_merge_read(fd, foffs, block)) {
error = DB_CORRUPTION;
} else {
+ mrec_buf_t* buf = mem_heap_alloc(graph_heap, sizeof *buf);
+
for (;;) {
const mrec_t* mrec;
dtuple_t* dtuple;
ulint n_ext;
- b = row_merge_read_rec(block, &buf, b, index,
+ b = row_merge_read_rec(block, buf, b, index,
fd, &foffs, &mrec, offsets);
if (UNIV_UNLIKELY(!b)) {
/* End of list, or I/O error */
@@ -1977,14 +2006,12 @@ row_merge_drop_index(
/* Drop the field definitions of the index. */
"DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n"
/* Drop the index definition and the B-tree. */
- "DELETE FROM SYS_INDEXES WHERE ID = :indexid\n"
- " AND TABLE_ID = :tableid;\n"
+ "DELETE FROM SYS_INDEXES WHERE ID = :indexid;\n"
"END;\n";
ut_ad(index && table && trx);
pars_info_add_dulint_literal(info, "indexid", index->id);
- pars_info_add_dulint_literal(info, "tableid", table->id);
trx_start_if_not_started(trx);
trx->op_info = "dropping index";
@@ -2033,47 +2060,79 @@ row_merge_drop_temp_indexes(void)
/*=============================*/
{
trx_t* trx;
- ulint err;
-
- /* We use the private SQL parser of Innobase to generate the
- query graphs needed in deleting the dictionary data from system
- tables in Innobase. Deleting a row from SYS_INDEXES table also
- frees the file segments of the B-tree associated with the index. */
- static const char drop_temp_indexes[] =
- "PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
- "indexid CHAR;\n"
- "DECLARE CURSOR c IS SELECT ID FROM SYS_INDEXES\n"
- "WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "';\n"
- "BEGIN\n"
- "\tOPEN c;\n"
- "\tWHILE 1=1 LOOP\n"
- "\t\tFETCH c INTO indexid;\n"
- "\t\tIF (SQL % NOTFOUND) THEN\n"
- "\t\t\tEXIT;\n"
- "\t\tEND IF;\n"
- "\t\tDELETE FROM SYS_FIELDS WHERE INDEX_ID = indexid;\n"
- "\t\tDELETE FROM SYS_INDEXES WHERE ID = indexid;\n"
- "\tEND LOOP;\n"
- "\tCLOSE c;\n"
- "\tCOMMIT WORK;\n"
- "END;\n";
+ btr_pcur_t pcur;
+ mtr_t mtr;
+ /* Load the table definitions that contain partially defined
+ indexes, so that the data dictionary information can be checked
+ when accessing the tablename.ibd files. */
trx = trx_allocate_for_background();
trx->op_info = "dropping partially created indexes";
row_mysql_lock_data_dictionary(trx);
- /* Incomplete transactions may be holding some locks on the
- data dictionary tables. However, they should never have been
- able to lock the records corresponding to the partially
- created indexes that we are attempting to delete, because the
- table was locked when the indexes were being created. We will
- drop the partially created indexes before the rollback of
- incomplete transactions is initiated. Thus, this should not
- interfere with the incomplete transactions. */
- trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
- err = que_eval_sql(NULL, drop_temp_indexes, FALSE, trx);
- ut_a(err == DB_SUCCESS);
+ mtr_start(&mtr);
+
+ btr_pcur_open_at_index_side(
+ TRUE,
+ dict_table_get_first_index(dict_sys->sys_indexes),
+ BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+ for (;;) {
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ dulint table_id;
+ dict_table_t* table;
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ break;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
+ field = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_NAME_FIELD,
+ &len);
+ if (len == UNIV_SQL_NULL || len == 0
+ || mach_read_from_1(field) != (ulint) TEMP_INDEX_PREFIX) {
+ continue;
+ }
+
+ /* This is a temporary index. */
+
+ field = rec_get_nth_field_old(rec, 0/*TABLE_ID*/, &len);
+ if (len != 8) {
+ /* Corrupted TABLE_ID */
+ continue;
+ }
+
+ table_id = mach_read_from_8(field);
+ btr_pcur_store_position(&pcur, &mtr);
+ btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+ table = dict_load_table_on_id(table_id);
+
+ if (table) {
+ dict_index_t* index;
+
+ for (index = dict_table_get_first_index(table);
+ index; index = dict_table_get_next_index(index)) {
+
+ if (*index->name == TEMP_INDEX_PREFIX) {
+ row_merge_drop_index(index, table, trx);
+ trx_commit_for_mysql(trx);
+ }
+ }
+ }
+
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ &pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
row_mysql_unlock_data_dictionary(trx);
trx_free_for_background(trx);
}
@@ -2086,9 +2145,22 @@ row_merge_file_create(
/*==================*/
merge_file_t* merge_file) /*!< out: merge file structure */
{
+#ifdef UNIV_PFS_IO
+ /* This temp file open does not go through normal
+ file APIs, add instrumentation to register with
+ performance schema */
+ struct PSI_file_locker* locker = NULL;
+ register_pfs_file_open_begin(locker, innodb_file_temp_key,
+ PSI_FILE_OPEN,
+ "Innodb Merge Temp File",
+ __FILE__, __LINE__);
+#endif
merge_file->fd = innobase_mysql_tmpfile();
merge_file->offset = 0;
merge_file->n_rec = 0;
+#ifdef UNIV_PFS_IO
+ register_pfs_file_open_end(locker, merge_file->fd);
+#endif
}
/*********************************************************************//**
@@ -2099,10 +2171,19 @@ row_merge_file_destroy(
/*===================*/
merge_file_t* merge_file) /*!< out: merge file structure */
{
+#ifdef UNIV_PFS_IO
+ struct PSI_file_locker* locker = NULL;
+ register_pfs_file_io_begin(locker, merge_file->fd, 0, PSI_FILE_CLOSE,
+ __FILE__, __LINE__);
+#endif
if (merge_file->fd != -1) {
close(merge_file->fd);
merge_file->fd = -1;
}
+
+#ifdef UNIV_PFS_IO
+ register_pfs_file_io_end(locker, 0);
+#endif
}
/*********************************************************************//**
diff --git a/storage/innobase/row/row0mysql.c b/storage/innobase/row/row0mysql.c
index 181c39de881..0d8d298453c 100644
--- a/storage/innobase/row/row0mysql.c
+++ b/storage/innobase/row/row0mysql.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2000, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2000, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -485,7 +485,7 @@ next_column:
/****************************************************************//**
Handles user errors and lock waits detected by the database engine.
@return TRUE if it was a lock wait and we should continue running the
-query thread */
+query thread and in that case the thr is ALREADY in the running state. */
UNIV_INTERN
ibool
row_mysql_handle_errors(
@@ -3255,19 +3255,13 @@ check_next_foreign:
"END;\n"
, FALSE, trx);
- if (err != DB_SUCCESS) {
- ut_a(err == DB_OUT_OF_FILE_SPACE);
-
- err = DB_MUST_GET_MORE_FILE_SPACE;
-
- row_mysql_handle_errors(&err, trx, NULL, NULL);
-
- ut_error;
- } else {
- ibool is_path;
+ switch (err) {
+ ibool is_temp;
const char* name_or_path;
mem_heap_t* heap;
+ case DB_SUCCESS:
+
heap = mem_heap_create(200);
/* Clone the name, in case it has been allocated
@@ -3277,12 +3271,13 @@ check_next_foreign:
space_id = table->space;
if (table->dir_path_of_temp_table != NULL) {
- is_path = TRUE;
name_or_path = mem_heap_strdup(
heap, table->dir_path_of_temp_table);
+ is_temp = TRUE;
} else {
- is_path = FALSE;
name_or_path = name;
+ is_temp = (table->flags >> DICT_TF2_SHIFT)
+ & DICT_TF2_TEMPORARY;
}
dict_table_remove_from_cache(table);
@@ -3302,8 +3297,8 @@ check_next_foreign:
if (err == DB_SUCCESS && space_id > 0) {
if (!fil_space_for_table_exists_in_mem(space_id,
name_or_path,
- is_path,
- FALSE, TRUE)) {
+ is_temp, FALSE,
+ !is_temp)) {
err = DB_SUCCESS;
fprintf(stderr,
@@ -3332,7 +3327,27 @@ check_next_foreign:
}
mem_heap_free(heap);
+ break;
+
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ /* Cannot even find a free slot for the
+ the undo log. We can directly exit here
+ and return the DB_TOO_MANY_CONCURRENT_TRXS
+ error. */
+ break;
+
+ case DB_OUT_OF_FILE_SPACE:
+ err = DB_MUST_GET_MORE_FILE_SPACE;
+
+ row_mysql_handle_errors(&err, trx, NULL, NULL);
+
+ /* Fall through to raise error */
+
+ default:
+ /* No other possible error returns */
+ ut_error;
}
+
funct_exit:
if (locked_dictionary) {
@@ -3348,6 +3363,90 @@ funct_exit:
return((int) err);
}
+/*********************************************************************//**
+Drop all temporary tables during crash recovery. */
+UNIV_INTERN
+void
+row_mysql_drop_temp_tables(void)
+/*============================*/
+{
+ trx_t* trx;
+ btr_pcur_t pcur;
+ mtr_t mtr;
+ mem_heap_t* heap;
+
+ trx = trx_allocate_for_background();
+ trx->op_info = "dropping temporary tables";
+ row_mysql_lock_data_dictionary(trx);
+
+ heap = mem_heap_create(200);
+
+ mtr_start(&mtr);
+
+ btr_pcur_open_at_index_side(
+ TRUE,
+ dict_table_get_first_index(dict_sys->sys_tables),
+ BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+ for (;;) {
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ const char* table_name;
+ dict_table_t* table;
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ break;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
+ field = rec_get_nth_field_old(rec, 4/*N_COLS*/, &len);
+ if (len != 4 || !(mach_read_from_4(field) & 0x80000000UL)) {
+ continue;
+ }
+
+ /* Because this is not a ROW_FORMAT=REDUNDANT table,
+ the is_temp flag is valid. Examine it. */
+
+ field = rec_get_nth_field_old(rec, 7/*MIX_LEN*/, &len);
+ if (len != 4
+ || !(mach_read_from_4(field) & DICT_TF2_TEMPORARY)) {
+ continue;
+ }
+
+ /* This is a temporary table. */
+ field = rec_get_nth_field_old(rec, 0/*NAME*/, &len);
+ if (len == UNIV_SQL_NULL || len == 0) {
+ /* Corrupted SYS_TABLES.NAME */
+ continue;
+ }
+
+ table_name = mem_heap_strdupl(heap, (const char*) field, len);
+
+ btr_pcur_store_position(&pcur, &mtr);
+ btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+ table = dict_load_table(table_name);
+
+ if (table) {
+ row_drop_table_for_mysql(table_name, trx, FALSE);
+ trx_commit_for_mysql(trx);
+ }
+
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ &pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+ row_mysql_unlock_data_dictionary(trx);
+ trx_free_for_background(trx);
+}
+
/*******************************************************************//**
Drop all foreign keys in a database, see Bug#18942.
Called at the end of row_drop_database_for_mysql().
@@ -3899,14 +3998,15 @@ Checks that the index contains entries in an ascending order, unique
constraint is not broken, and calculates the number of index entries
in the read view of the current transaction.
@return TRUE if ok */
-static
+UNIV_INTERN
ibool
-row_scan_and_check_index(
-/*=====================*/
- row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in MySQL */
- dict_index_t* index, /*!< in: index */
- ulint* n_rows) /*!< out: number of entries seen in the
- current consistent read */
+row_check_index_for_mysql(
+/*======================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct
+ in MySQL handle */
+ const dict_index_t* index, /*!< in: index */
+ ulint* n_rows) /*!< out: number of entries
+ seen in the consistent read */
{
dtuple_t* prev_entry = NULL;
ulint matched_fields;
@@ -3927,31 +4027,9 @@ row_scan_and_check_index(
*n_rows = 0;
- if (!row_merge_is_index_usable(prebuilt->trx, index)) {
- /* A newly created index may lack some delete-marked
- records that may exist in the read view of
- prebuilt->trx. Thus, such indexes must not be
- accessed by consistent read. */
- return(is_ok);
- }
-
buf = mem_alloc(UNIV_PAGE_SIZE);
heap = mem_heap_create(100);
- /* Make a dummy template in prebuilt, which we will use
- in scanning the index entries */
-
- prebuilt->index = index;
- /* row_merge_is_index_usable() was already checked above. */
- prebuilt->index_usable = TRUE;
- prebuilt->sql_stat_start = TRUE;
- prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE;
- prebuilt->n_template = 0;
- prebuilt->need_to_access_clustered = FALSE;
-
- dtuple_set_n_fields(prebuilt->search_tuple, 0);
-
- prebuilt->select_lock_type = LOCK_NONE;
cnt = 1000;
ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0);
@@ -4070,119 +4148,6 @@ not_ok:
}
/*********************************************************************//**
-Checks a table for corruption.
-@return DB_ERROR or DB_SUCCESS */
-UNIV_INTERN
-ulint
-row_check_table_for_mysql(
-/*======================*/
- row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
- handle */
-{
- dict_table_t* table = prebuilt->table;
- dict_index_t* index;
- ulint n_rows;
- ulint n_rows_in_table = ULINT_UNDEFINED;
- ulint ret = DB_SUCCESS;
- ulint old_isolation_level;
-
- if (table->ibd_file_missing) {
- ut_print_timestamp(stderr);
- fprintf(stderr, " InnoDB: Error:\n"
- "InnoDB: MySQL is trying to use a table handle"
- " but the .ibd file for\n"
- "InnoDB: table %s does not exist.\n"
- "InnoDB: Have you deleted the .ibd file"
- " from the database directory under\n"
- "InnoDB: the MySQL datadir, or have you"
- " used DISCARD TABLESPACE?\n"
- "InnoDB: Look from\n"
- "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
- "InnoDB: how you can resolve the problem.\n",
- table->name);
- return(DB_ERROR);
- }
-
- prebuilt->trx->op_info = "checking table";
-
- old_isolation_level = prebuilt->trx->isolation_level;
-
- /* We must run the index record counts at an isolation level
- >= READ COMMITTED, because a dirty read can see a wrong number
- of records in some index; to play safe, we use always
- REPEATABLE READ here */
-
- prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ;
-
- /* Enlarge the fatal lock wait timeout during CHECK TABLE. */
- mutex_enter(&kernel_mutex);
- srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */
- mutex_exit(&kernel_mutex);
-
- index = dict_table_get_first_index(table);
-
- while (index != NULL) {
- /* fputs("Validating index ", stderr);
- ut_print_name(stderr, trx, FALSE, index->name);
- putc('\n', stderr); */
-
- if (!btr_validate_index(index, prebuilt->trx)) {
- ret = DB_ERROR;
- } else {
- if (!row_scan_and_check_index(prebuilt,index, &n_rows)){
- ret = DB_ERROR;
- }
-
- if (trx_is_interrupted(prebuilt->trx)) {
- ret = DB_INTERRUPTED;
- break;
- }
-
- /* fprintf(stderr, "%lu entries in index %s\n", n_rows,
- index->name); */
-
- if (index == dict_table_get_first_index(table)) {
- n_rows_in_table = n_rows;
- } else if (n_rows != n_rows_in_table) {
-
- ret = DB_ERROR;
-
- fputs("Error: ", stderr);
- dict_index_name_print(stderr,
- prebuilt->trx, index);
- fprintf(stderr,
- " contains %lu entries,"
- " should be %lu\n",
- (ulong) n_rows,
- (ulong) n_rows_in_table);
- }
- }
-
- index = dict_table_get_next_index(index);
- }
-
- /* Restore the original isolation level */
- prebuilt->trx->isolation_level = old_isolation_level;
-
- /* We validate also the whole adaptive hash index for all tables
- at every CHECK TABLE */
-
- if (!btr_search_validate()) {
-
- ret = DB_ERROR;
- }
-
- /* Restore the fatal lock wait timeout after CHECK TABLE. */
- mutex_enter(&kernel_mutex);
- srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */
- mutex_exit(&kernel_mutex);
-
- prebuilt->trx->op_info = "";
-
- return(ret);
-}
-
-/*********************************************************************//**
Determines if a table is a magic monitor table.
@return TRUE if monitor table */
UNIV_INTERN
diff --git a/storage/innobase/row/row0purge.c b/storage/innobase/row/row0purge.c
index 500ebe571ab..da9d31f333f 100644
--- a/storage/innobase/row/row0purge.c
+++ b/storage/innobase/row/row0purge.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -215,33 +215,71 @@ retry:
}
/***********************************************************//**
-Removes a secondary index entry if possible.
+Determines if it is possible to remove a secondary index entry.
+Removal is possible if the secondary index entry does not refer to any
+not delete marked version of a clustered index record where DB_TRX_ID
+is newer than the purge view.
+
+NOTE: This function should only be called by the purge thread, only
+while holding a latch on the leaf page of the secondary index entry
+(or keeping the buffer pool watch on the page). It is possible that
+this function first returns TRUE and then FALSE, if a user transaction
+inserts a record that the secondary index entry would refer to.
+However, in that case, the user transaction would also re-insert the
+secondary index entry after purge has removed it and released the leaf
+page latch.
+@return TRUE if the secondary index record can be purged */
+UNIV_INTERN
+ibool
+row_purge_poss_sec(
+/*===============*/
+ purge_node_t* node, /*!< in/out: row purge node */
+ dict_index_t* index, /*!< in: secondary index */
+ const dtuple_t* entry) /*!< in: secondary index entry */
+{
+ ibool can_delete;
+ mtr_t mtr;
+
+ ut_ad(!dict_index_is_clust(index));
+ mtr_start(&mtr);
+
+ can_delete = !row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr)
+ || !row_vers_old_has_index_entry(TRUE,
+ btr_pcur_get_rec(&node->pcur),
+ &mtr, index, entry);
+
+ btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
+
+ return(can_delete);
+}
+
+/***************************************************************
+Removes a secondary index entry if possible, by modifying the
+index tree. Does not try to buffer the delete.
@return TRUE if success or if not found */
static
ibool
-row_purge_remove_sec_if_poss_low(
-/*=============================*/
+row_purge_remove_sec_if_poss_tree(
+/*==============================*/
purge_node_t* node, /*!< in: row purge node */
dict_index_t* index, /*!< in: index */
- const dtuple_t* entry, /*!< in: index entry */
- ulint mode) /*!< in: latch mode BTR_MODIFY_LEAF or
- BTR_MODIFY_TREE */
+ const dtuple_t* entry) /*!< in: index entry */
{
- btr_pcur_t pcur;
- btr_cur_t* btr_cur;
- ibool success;
- ibool old_has = 0; /* remove warning */
- ibool found;
- ulint err;
- mtr_t mtr;
- mtr_t mtr_vers;
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool success = TRUE;
+ ulint err;
+ mtr_t mtr;
+ enum row_search_result search_result;
log_free_check();
mtr_start(&mtr);
- found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+ search_result = row_search_index_entry(index, entry, BTR_MODIFY_TREE,
+ &pcur, &mtr);
- if (!found) {
+ switch (search_result) {
+ case ROW_NOT_FOUND:
/* Not found. This is a legitimate condition. In a
rollback, InnoDB will remove secondary recs that would
be purged anyway. Then the actual purge will not find
@@ -254,11 +292,15 @@ row_purge_remove_sec_if_poss_low(
/* fputs("PURGE:........sec entry not found\n", stderr); */
/* dtuple_print(stderr, entry); */
-
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
-
- return(TRUE);
+ goto func_exit;
+ case ROW_FOUND:
+ break;
+ case ROW_BUFFERED:
+ case ROW_NOT_DELETED_REF:
+ /* These are invalid outcomes, because the mode passed
+ to row_search_index_entry() did not include any of the
+ flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
+ ut_error;
}
btr_cur = btr_pcur_get_btr_cur(&pcur);
@@ -267,38 +309,102 @@ row_purge_remove_sec_if_poss_low(
which cannot be purged yet, requires its existence. If some requires,
we should do nothing. */
- mtr_start(&mtr_vers);
-
- success = row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr_vers);
-
- if (success) {
- old_has = row_vers_old_has_index_entry(
- TRUE, btr_pcur_get_rec(&(node->pcur)),
- &mtr_vers, index, entry);
- }
-
- btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers);
+ if (row_purge_poss_sec(node, index, entry)) {
+ /* Remove the index record, which should have been
+ marked for deletion. */
+ ut_ad(REC_INFO_DELETED_FLAG
+ & rec_get_info_bits(btr_cur_get_rec(btr_cur),
+ dict_table_is_comp(index->table)));
- if (!success || !old_has) {
- /* Remove the index record */
-
- if (mode == BTR_MODIFY_LEAF) {
- success = btr_cur_optimistic_delete(btr_cur, &mtr);
- } else {
- ut_ad(mode == BTR_MODIFY_TREE);
- btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
- RB_NONE, &mtr);
- success = err == DB_SUCCESS;
- ut_a(success || err == DB_OUT_OF_FILE_SPACE);
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+ RB_NONE, &mtr);
+ switch (UNIV_EXPECT(err, DB_SUCCESS)) {
+ case DB_SUCCESS:
+ break;
+ case DB_OUT_OF_FILE_SPACE:
+ success = FALSE;
+ break;
+ default:
+ ut_error;
}
}
+func_exit:
btr_pcur_close(&pcur);
mtr_commit(&mtr);
return(success);
}
+/***************************************************************
+Removes a secondary index entry without modifying the index tree,
+if possible.
+@return TRUE if success or if not found */
+static
+ibool
+row_purge_remove_sec_if_poss_leaf(
+/*==============================*/
+ purge_node_t* node, /*!< in: row purge node */
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry) /*!< in: index entry */
+{
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ enum row_search_result search_result;
+
+ log_free_check();
+
+ mtr_start(&mtr);
+
+ /* Set the purge node for the call to row_purge_poss_sec(). */
+ pcur.btr_cur.purge_node = node;
+ /* Set the query thread, so that ibuf_insert_low() will be
+ able to invoke thd_get_trx(). */
+ pcur.btr_cur.thr = que_node_get_parent(node);
+
+ search_result = row_search_index_entry(
+ index, entry, BTR_MODIFY_LEAF | BTR_DELETE, &pcur, &mtr);
+
+ switch (search_result) {
+ ibool success;
+ case ROW_FOUND:
+ /* Before attempting to purge a record, check
+ if it is safe to do so. */
+ if (row_purge_poss_sec(node, index, entry)) {
+ btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ /* Only delete-marked records should be purged. */
+ ut_ad(REC_INFO_DELETED_FLAG
+ & rec_get_info_bits(
+ btr_cur_get_rec(btr_cur),
+ dict_table_is_comp(index->table)));
+
+ if (!btr_cur_optimistic_delete(btr_cur, &mtr)) {
+
+ /* The index entry could not be deleted. */
+ success = FALSE;
+ goto func_exit;
+ }
+ }
+ /* fall through (the index entry is still needed,
+ or the deletion succeeded) */
+ case ROW_NOT_DELETED_REF:
+ /* The index entry is still needed. */
+ case ROW_BUFFERED:
+ /* The deletion was buffered. */
+ case ROW_NOT_FOUND:
+ /* The index entry does not exist, nothing to do. */
+ success = TRUE;
+ func_exit:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ return(success);
+ }
+
+ ut_error;
+ return(FALSE);
+}
+
/***********************************************************//**
Removes a secondary index entry if possible. */
UNIV_INLINE
@@ -314,15 +420,12 @@ row_purge_remove_sec_if_poss(
/* fputs("Purge: Removing secondary record\n", stderr); */
- success = row_purge_remove_sec_if_poss_low(node, index, entry,
- BTR_MODIFY_LEAF);
- if (success) {
+ if (row_purge_remove_sec_if_poss_leaf(node, index, entry)) {
return;
}
retry:
- success = row_purge_remove_sec_if_poss_low(node, index, entry,
- BTR_MODIFY_TREE);
+ success = row_purge_remove_sec_if_poss_tree(node, index, entry);
/* The delete operation may fail if we have little
file space left: TODO: easiest to crash the database
and restart with more file space */
diff --git a/storage/innobase/row/row0row.c b/storage/innobase/row/row0row.c
index 128ac3ba3e8..6cdfa410c15 100644
--- a/storage/innobase/row/row0row.c
+++ b/storage/innobase/row/row0row.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -730,9 +730,9 @@ row_get_clust_rec(
/***************************************************************//**
Searches an index record.
-@return TRUE if found */
+@return whether the record was found or buffered */
UNIV_INTERN
-ibool
+enum row_search_result
row_search_index_entry(
/*===================*/
dict_index_t* index, /*!< in: index */
@@ -749,13 +749,38 @@ row_search_index_entry(
ut_ad(dtuple_check_typed(entry));
btr_pcur_open(index, entry, PAGE_CUR_LE, mode, pcur, mtr);
+
+ switch (btr_pcur_get_btr_cur(pcur)->flag) {
+ case BTR_CUR_DELETE_REF:
+ ut_a(mode & BTR_DELETE);
+ return(ROW_NOT_DELETED_REF);
+
+ case BTR_CUR_DEL_MARK_IBUF:
+ case BTR_CUR_DELETE_IBUF:
+ case BTR_CUR_INSERT_TO_IBUF:
+ return(ROW_BUFFERED);
+
+ case BTR_CUR_HASH:
+ case BTR_CUR_HASH_FAIL:
+ case BTR_CUR_BINARY:
+ break;
+ }
+
low_match = btr_pcur_get_low_match(pcur);
rec = btr_pcur_get_rec(pcur);
n_fields = dtuple_get_n_fields(entry);
- return(!page_rec_is_infimum(rec) && low_match == n_fields);
+ if (page_rec_is_infimum(rec)) {
+
+ return(ROW_NOT_FOUND);
+ } else if (low_match != n_fields) {
+
+ return(ROW_NOT_FOUND);
+ }
+
+ return(ROW_FOUND);
}
#include <my_sys.h>
@@ -915,6 +940,10 @@ row_raw_format(
ret = row_raw_format_int(data, data_len, prtype,
buf, buf_size, &format_in_hex);
+ if (format_in_hex) {
+
+ goto format_in_hex;
+ }
break;
case DATA_CHAR:
case DATA_VARCHAR:
@@ -923,14 +952,15 @@ row_raw_format(
ret = row_raw_format_str(data, data_len, prtype,
buf, buf_size, &format_in_hex);
+ if (format_in_hex) {
+
+ goto format_in_hex;
+ }
+
break;
/* XXX support more data types */
default:
-
- format_in_hex = TRUE;
- }
-
- if (format_in_hex) {
+ format_in_hex:
if (UNIV_LIKELY(buf_size > 2)) {
diff --git a/storage/innobase/row/row0sel.c b/storage/innobase/row/row0sel.c
index 3ef9726588e..16d4f2f7bfd 100644
--- a/storage/innobase/row/row0sel.c
+++ b/storage/innobase/row/row0sel.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -132,7 +132,8 @@ index record.
NOTE: the comparison is NOT done as a binary comparison, but character
fields are compared with collation!
@return TRUE if the secondary record is equal to the corresponding
-fields in the clustered record, when compared with collation */
+fields in the clustered record, when compared with collation;
+FALSE if not equal or if the clustered record has been marked for deletion */
static
ibool
row_sel_sec_rec_is_for_clust_rec(
@@ -431,10 +432,6 @@ row_sel_fetch_columns(
data = rec_get_nth_field(rec, offsets,
field_no, &len);
- if (len == UNIV_SQL_NULL) {
- len = UNIV_SQL_NULL;
- }
-
needs_copy = column->copy_val;
}
@@ -2170,36 +2167,6 @@ row_fetch_print(
return((void*)42);
}
-/****************************************************************//**
-Callback function for fetch that stores an unsigned 4 byte integer to the
-location pointed. The column's type must be DATA_INT, DATA_UNSIGNED, length
-= 4.
-@return always returns NULL */
-UNIV_INTERN
-void*
-row_fetch_store_uint4(
-/*==================*/
- void* row, /*!< in: sel_node_t* */
- void* user_arg) /*!< in: data pointer */
-{
- sel_node_t* node = row;
- ib_uint32_t* val = user_arg;
- ulint tmp;
-
- dfield_t* dfield = que_node_get_val(node->select_list);
- const dtype_t* type = dfield_get_type(dfield);
- ulint len = dfield_get_len(dfield);
-
- ut_a(dtype_get_mtype(type) == DATA_INT);
- ut_a(dtype_get_prtype(type) & DATA_UNSIGNED);
- ut_a(len == 4);
-
- tmp = mach_read_from_4(dfield_get_data(dfield));
- *val = (ib_uint32_t) tmp;
-
- return(NULL);
-}
-
/***********************************************************//**
Prints a row in a select result.
@return query thread to run next or NULL */
@@ -2981,6 +2948,7 @@ row_sel_get_clust_rec_for_mysql(
if (clust_rec
&& (old_vers
+ || trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
|| rec_get_deleted_flag(rec, dict_table_is_comp(
sec_index->table)))
&& !row_sel_sec_rec_is_for_clust_rec(
@@ -3202,14 +3170,17 @@ row_sel_try_search_shortcut_for_mysql(
ut_ad(dict_index_is_clust(index));
ut_ad(!prebuilt->templ_contains_blob);
+#ifndef UNIV_SEARCH_DEBUG
btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
BTR_SEARCH_LEAF, pcur,
-#ifndef UNIV_SEARCH_DEBUG
RW_S_LATCH,
-#else
+ mtr);
+#else /* UNIV_SEARCH_DEBUG */
+ btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, pcur,
0,
-#endif
mtr);
+#endif /* UNIV_SEARCH_DEBUG */
rec = btr_pcur_get_rec(pcur);
if (!page_rec_is_user_rec(rec)) {
@@ -4616,6 +4587,7 @@ row_search_autoinc_read_column(
dict_index_t* index, /*!< in: index to read from */
const rec_t* rec, /*!< in: current rec */
ulint col_no, /*!< in: column number */
+ ulint mtype, /*!< in: column main type */
ibool unsigned_type) /*!< in: signed or unsigned flag */
{
ulint len;
@@ -4632,10 +4604,26 @@ row_search_autoinc_read_column(
data = rec_get_nth_field(rec, offsets, col_no, &len);
ut_a(len != UNIV_SQL_NULL);
- ut_a(len <= sizeof value);
- /* we assume AUTOINC value cannot be negative */
- value = mach_read_int_type(data, len, unsigned_type);
+ switch (mtype) {
+ case DATA_INT:
+ ut_a(len <= sizeof value);
+ value = mach_read_int_type(data, len, unsigned_type);
+ break;
+
+ case DATA_FLOAT:
+ ut_a(len == sizeof(float));
+ value = (ib_uint64_t) mach_float_read(data);
+ break;
+
+ case DATA_DOUBLE:
+ ut_a(len == sizeof(double));
+ value = (ib_uint64_t) mach_double_read(data);
+ break;
+
+ default:
+ ut_error;
+ }
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
@@ -4721,7 +4709,8 @@ row_search_max_autoinc(
dfield->col->prtype & DATA_UNSIGNED);
*value = row_search_autoinc_read_column(
- index, rec, i, unsigned_type);
+ index, rec, i,
+ dfield->col->mtype, unsigned_type);
}
}
diff --git a/storage/innobase/row/row0uins.c b/storage/innobase/row/row0uins.c
index 9f9c814f1a5..c35f1ef7a44 100644
--- a/storage/innobase/row/row0uins.c
+++ b/storage/innobase/row/row0uins.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -145,37 +145,39 @@ row_undo_ins_remove_sec_low(
dict_index_t* index, /*!< in: index */
dtuple_t* entry) /*!< in: index entry to remove */
{
- btr_pcur_t pcur;
- btr_cur_t* btr_cur;
- ibool found;
- ibool success;
- ulint err;
- mtr_t mtr;
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+ mtr_t mtr;
+ enum row_search_result search_result;
log_free_check();
mtr_start(&mtr);
- found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
-
btr_cur = btr_pcur_get_btr_cur(&pcur);
- if (!found) {
- /* Not found */
-
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
-
- return(DB_SUCCESS);
+ ut_ad(mode == BTR_MODIFY_TREE || mode == BTR_MODIFY_LEAF);
+
+ search_result = row_search_index_entry(index, entry, mode,
+ &pcur, &mtr);
+
+ switch (search_result) {
+ case ROW_NOT_FOUND:
+ err = DB_SUCCESS;
+ goto func_exit;
+ case ROW_FOUND:
+ break;
+ case ROW_BUFFERED:
+ case ROW_NOT_DELETED_REF:
+ /* These are invalid outcomes, because the mode passed
+ to row_search_index_entry() did not include any of the
+ flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
+ ut_error;
}
if (mode == BTR_MODIFY_LEAF) {
- success = btr_cur_optimistic_delete(btr_cur, &mtr);
-
- if (success) {
- err = DB_SUCCESS;
- } else {
- err = DB_FAIL;
- }
+ err = btr_cur_optimistic_delete(btr_cur, &mtr)
+ ? DB_SUCCESS : DB_FAIL;
} else {
ut_ad(mode == BTR_MODIFY_TREE);
@@ -188,7 +190,7 @@ row_undo_ins_remove_sec_low(
btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
RB_NORMAL, &mtr);
}
-
+func_exit:
btr_pcur_close(&pcur);
mtr_commit(&mtr);
diff --git a/storage/innobase/row/row0umod.c b/storage/innobase/row/row0umod.c
index 6be475d8c78..80f57870316 100644
--- a/storage/innobase/row/row0umod.c
+++ b/storage/innobase/row/row0umod.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -144,13 +144,17 @@ row_undo_mod_clust_low(
/***********************************************************//**
Removes a clustered index record after undo if possible.
+This is attempted when the record was inserted by updating a
+delete-marked record and there no longer exist transactions
+that would see the delete-marked record. In other words, we
+roll back the insert by purging the record.
@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */
static
ulint
row_undo_mod_remove_clust_low(
/*==========================*/
undo_node_t* node, /*!< in: row undo node */
- que_thr_t* thr __attribute__((unused)), /*!< in: query thread */
+ que_thr_t* thr, /*!< in: query thread */
mtr_t* mtr, /*!< in: mtr */
ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
{
@@ -159,6 +163,7 @@ row_undo_mod_remove_clust_low(
ulint err;
ibool success;
+ ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
pcur = &(node->pcur);
btr_cur = btr_pcur_get_btr_cur(pcur);
@@ -190,11 +195,13 @@ row_undo_mod_remove_clust_low(
} else {
ut_ad(mode == BTR_MODIFY_TREE);
- /* Note that since this operation is analogous to purge,
- we can free also inherited externally stored fields:
- hence the RB_NONE in the call below */
+ /* This operation is analogous to purge, we can free also
+ inherited externally stored fields */
- btr_cur_pessimistic_delete(&err, FALSE, btr_cur, RB_NONE, mtr);
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+ thr_is_recv(thr)
+ ? RB_RECOVERY_PURGE_REC
+ : RB_NONE, mtr);
/* The delete operation may fail if we have little
file space left: TODO: easiest to crash the database
@@ -307,23 +314,27 @@ row_undo_mod_del_mark_or_remove_sec_low(
ulint mode) /*!< in: latch mode BTR_MODIFY_LEAF or
BTR_MODIFY_TREE */
{
- ibool found;
- btr_pcur_t pcur;
- btr_cur_t* btr_cur;
- ibool success;
- ibool old_has;
- ulint err;
- mtr_t mtr;
- mtr_t mtr_vers;
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ ibool old_has;
+ ulint err;
+ mtr_t mtr;
+ mtr_t mtr_vers;
+ enum row_search_result search_result;
log_free_check();
mtr_start(&mtr);
- found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
-
btr_cur = btr_pcur_get_btr_cur(&pcur);
- if (!found) {
+ ut_ad(mode == BTR_MODIFY_TREE || mode == BTR_MODIFY_LEAF);
+
+ search_result = row_search_index_entry(index, entry, mode,
+ &pcur, &mtr);
+
+ switch (UNIV_EXPECT(search_result, ROW_FOUND)) {
+ case ROW_NOT_FOUND:
/* In crash recovery, the secondary index record may
be missing if the UPDATE did not have time to insert
the secondary index records before the crash. When we
@@ -334,10 +345,16 @@ row_undo_mod_del_mark_or_remove_sec_low(
before it has inserted all updated secondary index
records, then the undo will not find those records. */
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
-
- return(DB_SUCCESS);
+ err = DB_SUCCESS;
+ goto func_exit;
+ case ROW_FOUND:
+ break;
+ case ROW_BUFFERED:
+ case ROW_NOT_DELETED_REF:
+ /* These are invalid outcomes, because the mode passed
+ to row_search_index_entry() did not include any of the
+ flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
+ ut_error;
}
/* We should remove the index record if no prior version of the row,
@@ -370,10 +387,11 @@ row_undo_mod_del_mark_or_remove_sec_low(
} else {
ut_ad(mode == BTR_MODIFY_TREE);
- /* No need to distinguish RB_RECOVERY here, because we
- are deleting a secondary index record: the distinction
- between RB_NORMAL and RB_RECOVERY only matters when
- deleting a record that contains externally stored
+ /* No need to distinguish RB_RECOVERY_PURGE here,
+ because we are deleting a secondary index record:
+ the distinction between RB_NORMAL and
+ RB_RECOVERY_PURGE only matters when deleting a
+ record that contains externally stored
columns. */
ut_ad(!dict_index_is_clust(index));
btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
@@ -386,6 +404,8 @@ row_undo_mod_del_mark_or_remove_sec_low(
}
btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers);
+
+func_exit:
btr_pcur_close(&pcur);
mtr_commit(&mtr);
@@ -438,15 +458,17 @@ row_undo_mod_del_unmark_sec_and_undo_update(
BTR_MODIFY_TREE */
que_thr_t* thr, /*!< in: query thread */
dict_index_t* index, /*!< in: index */
- dtuple_t* entry) /*!< in: index entry */
+ const dtuple_t* entry) /*!< in: index entry */
{
- mem_heap_t* heap;
- btr_pcur_t pcur;
- upd_t* update;
- ulint err = DB_SUCCESS;
- big_rec_t* dummy_big_rec;
- mtr_t mtr;
- trx_t* trx = thr_get_trx(thr);
+ mem_heap_t* heap;
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ upd_t* update;
+ ulint err = DB_SUCCESS;
+ big_rec_t* dummy_big_rec;
+ mtr_t mtr;
+ trx_t* trx = thr_get_trx(thr);
+ enum row_search_result search_result;
/* Ignore indexes that are being created. */
if (UNIV_UNLIKELY(*index->name == TEMP_INDEX_PREFIX)) {
@@ -457,8 +479,19 @@ row_undo_mod_del_unmark_sec_and_undo_update(
log_free_check();
mtr_start(&mtr);
- if (UNIV_UNLIKELY(!row_search_index_entry(index, entry,
- mode, &pcur, &mtr))) {
+ ut_ad(mode == BTR_MODIFY_TREE || mode == BTR_MODIFY_LEAF);
+
+ search_result = row_search_index_entry(index, entry, mode,
+ &pcur, &mtr);
+
+ switch (search_result) {
+ case ROW_BUFFERED:
+ case ROW_NOT_DELETED_REF:
+ /* These are invalid outcomes, because the mode passed
+ to row_search_index_entry() did not include any of the
+ flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
+ ut_error;
+ case ROW_NOT_FOUND:
fputs("InnoDB: error in sec index entry del undo in\n"
"InnoDB: ", stderr);
dict_index_name_print(stderr, trx, index);
@@ -473,9 +506,9 @@ row_undo_mod_del_unmark_sec_and_undo_update(
fputs("\n"
"InnoDB: Submit a detailed bug report"
" to http://bugs.mysql.com\n", stderr);
- } else {
- btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur);
-
+ break;
+ case ROW_FOUND:
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
btr_cur, FALSE, thr, &mtr);
ut_a(err == DB_SUCCESS);
@@ -533,6 +566,7 @@ row_undo_mod_upd_del_sec(
dict_index_t* index;
ulint err = DB_SUCCESS;
+ ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
heap = mem_heap_create(1024);
while (node->index != NULL) {
@@ -550,7 +584,7 @@ row_undo_mod_upd_del_sec(
does not exist. However, this situation may
only occur during the rollback of incomplete
transactions. */
- ut_a(trx_is_recv(thr_get_trx(thr)));
+ ut_a(thr_is_recv(thr));
} else {
err = row_undo_mod_del_mark_or_remove_sec(
node, thr, index, entry);
diff --git a/storage/innobase/row/row0upd.c b/storage/innobase/row/row0upd.c
index 58dfd43ead9..f1a90a3bf1c 100644
--- a/storage/innobase/row/row0upd.c
+++ b/storage/innobase/row/row0upd.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -1344,9 +1344,6 @@ row_upd_copy_columns(
data = rec_get_nth_field(rec, offsets,
column->field_nos[SYM_CLUST_FIELD_NO],
&len);
- if (len == UNIV_SQL_NULL) {
- len = UNIV_SQL_NULL;
- }
eval_node_copy_and_alloc_val(column, data, len);
column = UT_LIST_GET_NEXT(col_var_list, column);
@@ -1434,21 +1431,22 @@ row_upd_sec_index_entry(
upd_node_t* node, /*!< in: row update node */
que_thr_t* thr) /*!< in: query thread */
{
- ibool check_ref;
- ibool found;
- dict_index_t* index;
- dtuple_t* entry;
- btr_pcur_t pcur;
- btr_cur_t* btr_cur;
- mem_heap_t* heap;
- rec_t* rec;
- ulint err = DB_SUCCESS;
- mtr_t mtr;
- trx_t* trx = thr_get_trx(thr);
+ mtr_t mtr;
+ const rec_t* rec;
+ btr_pcur_t pcur;
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ btr_cur_t* btr_cur;
+ ibool referenced;
+ ulint err = DB_SUCCESS;
+ trx_t* trx = thr_get_trx(thr);
+ ulint mode = BTR_MODIFY_LEAF;
+ enum row_search_result search_result;
index = node->index;
- check_ref = row_upd_index_is_referenced(index, trx);
+ referenced = row_upd_index_is_referenced(index, trx);
heap = mem_heap_create(1024);
@@ -1459,13 +1457,33 @@ row_upd_sec_index_entry(
log_free_check();
mtr_start(&mtr);
- found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur,
- &mtr);
+ /* Set the query thread, so that ibuf_insert_low() will be
+ able to invoke thd_get_trx(). */
+ btr_pcur_get_btr_cur(&pcur)->thr = thr;
+
+ /* We can only try to use the insert/delete buffer to buffer
+ delete-mark operations if the index we're modifying has no foreign
+ key constraints referring to it. */
+ if (!referenced) {
+ mode |= BTR_DELETE_MARK;
+ }
+
+ search_result = row_search_index_entry(index, entry, mode,
+ &pcur, &mtr);
+
btr_cur = btr_pcur_get_btr_cur(&pcur);
rec = btr_cur_get_rec(btr_cur);
- if (UNIV_UNLIKELY(!found)) {
+ switch (search_result) {
+ case ROW_NOT_DELETED_REF: /* should only occur for BTR_DELETE */
+ ut_error;
+ break;
+ case ROW_BUFFERED:
+ /* Entry was delete marked already. */
+ break;
+
+ case ROW_NOT_FOUND:
fputs("InnoDB: error in sec index entry update in\n"
"InnoDB: ", stderr);
dict_index_name_print(stderr, trx, index);
@@ -1482,20 +1500,26 @@ row_upd_sec_index_entry(
fputs("\n"
"InnoDB: Submit a detailed bug report"
" to http://bugs.mysql.com\n", stderr);
- } else {
+ break;
+ case ROW_FOUND:
/* Delete mark the old index record; it can already be
delete marked if we return after a lock wait in
row_ins_index_entry below */
- if (!rec_get_deleted_flag(rec,
- dict_table_is_comp(index->table))) {
- err = btr_cur_del_mark_set_sec_rec(0, btr_cur, TRUE,
- thr, &mtr);
- if (err == DB_SUCCESS && check_ref) {
+ if (!rec_get_deleted_flag(
+ rec, dict_table_is_comp(index->table))) {
+
+ err = btr_cur_del_mark_set_sec_rec(
+ 0, btr_cur, TRUE, thr, &mtr);
+
+ if (err == DB_SUCCESS && referenced) {
+
+ ulint* offsets;
+
+ offsets = rec_get_offsets(
+ rec, index, NULL, ULINT_UNDEFINED,
+ &heap);
- ulint* offsets = rec_get_offsets(
- rec, index, NULL,
- ULINT_UNDEFINED, &heap);
/* NOTE that the following call loses
the position of pcur ! */
err = row_upd_check_references_constraints(
@@ -1503,6 +1527,7 @@ row_upd_sec_index_entry(
index, offsets, thr, &mtr);
}
}
+ break;
}
btr_pcur_close(&pcur);
@@ -1566,7 +1591,7 @@ row_upd_clust_rec_by_insert(
upd_node_t* node, /*!< in: row update node */
dict_index_t* index, /*!< in: clustered index of the record */
que_thr_t* thr, /*!< in: query thread */
- ibool check_ref,/*!< in: TRUE if index may be referenced in
+ ibool referenced,/*!< in: TRUE if index may be referenced in
a foreign key constraint */
mtr_t* mtr) /*!< in: mtr; gets committed here */
{
@@ -1612,16 +1637,21 @@ row_upd_clust_rec_by_insert(
btr_cur_mark_extern_inherited_fields(
btr_cur_get_page_zip(btr_cur),
rec, index, offsets, node->update, mtr);
- if (check_ref) {
+ if (referenced) {
/* NOTE that the following call loses
the position of pcur ! */
+
err = row_upd_check_references_constraints(
node, pcur, table, index, offsets, thr, mtr);
+
if (err != DB_SUCCESS) {
+
mtr_commit(mtr);
+
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
+
return(err);
}
}
@@ -1776,7 +1806,8 @@ row_upd_del_mark_clust_rec(
ulint* offsets,/*!< in/out: rec_get_offsets() for the
record under the cursor */
que_thr_t* thr, /*!< in: query thread */
- ibool check_ref,/*!< in: TRUE if index may be referenced in
+ ibool referenced,
+ /*!< in: TRUE if index may be referenced in
a foreign key constraint */
mtr_t* mtr) /*!< in: mtr; gets committed here */
{
@@ -1801,13 +1832,11 @@ row_upd_del_mark_clust_rec(
err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG,
btr_cur, TRUE, thr, mtr);
- if (err == DB_SUCCESS && check_ref) {
+ if (err == DB_SUCCESS && referenced) {
/* NOTE that the following call loses the position of pcur ! */
- err = row_upd_check_references_constraints(node,
- pcur, index->table,
- index, offsets,
- thr, mtr);
+ err = row_upd_check_references_constraints(
+ node, pcur, index->table, index, offsets, thr, mtr);
}
mtr_commit(mtr);
@@ -1829,7 +1858,6 @@ row_upd_clust_step(
dict_index_t* index;
btr_pcur_t* pcur;
ibool success;
- ibool check_ref;
ulint err;
mtr_t* mtr;
mtr_t mtr_buf;
@@ -1837,11 +1865,12 @@ row_upd_clust_step(
mem_heap_t* heap = NULL;
ulint offsets_[REC_OFFS_NORMAL_SIZE];
ulint* offsets;
+ ibool referenced;
rec_offs_init(offsets_);
index = dict_table_get_first_index(node->table);
- check_ref = row_upd_index_is_referenced(index, thr_get_trx(thr));
+ referenced = row_upd_index_is_referenced(index, thr_get_trx(thr));
pcur = node->pcur;
@@ -1911,8 +1940,9 @@ row_upd_clust_step(
/* NOTE: the following function calls will also commit mtr */
if (node->is_delete) {
- err = row_upd_del_mark_clust_rec(node, index, offsets,
- thr, check_ref, mtr);
+ err = row_upd_del_mark_clust_rec(
+ node, index, offsets, thr, referenced, mtr);
+
if (err == DB_SUCCESS) {
node->state = UPD_NODE_UPDATE_ALL_SEC;
node->index = dict_table_get_next_index(index);
@@ -1960,8 +1990,9 @@ exit_func:
choosing records to update. MySQL solves now the problem
externally! */
- err = row_upd_clust_rec_by_insert(node, index, thr, check_ref,
- mtr);
+ err = row_upd_clust_rec_by_insert(
+ node, index, thr, referenced, mtr);
+
if (err != DB_SUCCESS) {
return(err);
diff --git a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c
index 3b0e29b9b48..838df292bfc 100644
--- a/storage/innobase/srv/srv0srv.c
+++ b/storage/innobase/srv/srv0srv.c
@@ -1,7 +1,8 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
+Copyright (c) 2009, Percona Inc.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -9,6 +10,13 @@ briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
@@ -22,32 +30,6 @@ this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA
*****************************************************************************/
-/***********************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-Copyright (c) 2009, Percona Inc.
-
-Portions of this file contain modifications contributed and copyrighted
-by Percona Inc.. Those modifications are
-gratefully acknowledged and are described briefly in the InnoDB
-documentation. The contributions by Percona Inc. are incorporated with
-their permission, and subject to the conditions contained in the file
-COPYING.Percona.
-
-This program is free software; you can redistribute it and/or modify it
-under the terms of the GNU General Public License as published by the
-Free Software Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
-Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-***********************************************************************/
/**************************************************//**
@file srv/srv0srv.c
@@ -119,7 +101,8 @@ UNIV_INTERN ulint srv_fatal_semaphore_wait_threshold = 600;
in microseconds, in order to reduce the lagging of the purge thread. */
UNIV_INTERN ulint srv_dml_needed_delay = 0;
-UNIV_INTERN ibool srv_lock_timeout_and_monitor_active = FALSE;
+UNIV_INTERN ibool srv_lock_timeout_active = FALSE;
+UNIV_INTERN ibool srv_monitor_active = FALSE;
UNIV_INTERN ibool srv_error_monitor_active = FALSE;
UNIV_INTERN const char* srv_main_thread_op_info = "";
@@ -154,6 +137,12 @@ UNIV_INTERN ulint srv_check_file_format_at_startup = DICT_TF_FORMAT_MAX;
on duplicate key checking and foreign key checking */
UNIV_INTERN ibool srv_locks_unsafe_for_binlog = FALSE;
+/* If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use simulated aio we build below with threads.
+Currently we support native aio on windows and linux */
+UNIV_INTERN my_bool srv_use_native_aio = TRUE;
+
UNIV_INTERN ulint srv_n_data_files = 0;
UNIV_INTERN char** srv_data_file_names = NULL;
/* size in database pages */
@@ -188,7 +177,17 @@ UNIV_INTERN ulong srv_flush_log_at_trx_commit = 1;
the checkpoints. */
UNIV_INTERN char srv_adaptive_flushing = TRUE;
-/* The sort order table of the MySQL latin1_swedish_ci character set
+/** Maximum number of times allowed to conditionally acquire
+mutex before switching to blocking wait on the mutex */
+#define MAX_MUTEX_NOWAIT 20
+
+/** Check whether the number of failed nonblocking mutex
+acquisition attempts exceeds maximum allowed value. If so,
+srv_printf_innodb_monitor() will request mutex acquisition
+with mutex_enter(), which will wait until it gets the mutex. */
+#define MUTEX_NOWAIT(mutex_skipped) ((mutex_skipped) < MAX_MUTEX_NOWAIT)
+
+/** The sort order table of the MySQL latin1_swedish_ci character set
collation */
UNIV_INTERN const byte* srv_latin1_ordering;
@@ -247,6 +246,12 @@ that during a time of heavy update/insert activity. */
UNIV_INTERN ulong srv_max_buf_pool_modified_pct = 75;
+/* the number of purge threads to use from the worker pool (currently 0 or 1).*/
+UNIV_INTERN ulint srv_n_purge_threads = 0;
+
+/* the number of records to purge in one batch */
+UNIV_INTERN ulint srv_purge_batch_size = 20;
+
/* variable counts amount of data read in total (in bytes) */
UNIV_INTERN ulint srv_data_read = 0;
@@ -424,6 +429,20 @@ UNIV_INTERN mutex_t srv_innodb_monitor_mutex;
/* Mutex for locking srv_monitor_file */
UNIV_INTERN mutex_t srv_monitor_file_mutex;
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register kernel_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t kernel_mutex_key;
+/* Key to register srv_innodb_monitor_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t srv_innodb_monitor_mutex_key;
+/* Key to register srv_monitor_file_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t srv_monitor_file_mutex_key;
+/* Key to register srv_dict_tmpfile_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t srv_dict_tmpfile_mutex_key;
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t srv_misc_tmpfile_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
/* Temporary file for innodb monitor output */
UNIV_INTERN FILE* srv_monitor_file;
/* Mutex for locking srv_dict_tmpfile.
@@ -691,6 +710,16 @@ are indexed by the type of the thread. */
UNIV_INTERN ulint srv_n_threads_active[SRV_MASTER + 1];
UNIV_INTERN ulint srv_n_threads[SRV_MASTER + 1];
+/*********************************************************************//**
+Asynchronous purge thread.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_purge_thread(
+/*=============*/
+ void* arg __attribute__((unused))); /*!< in: a dummy parameter
+ required by os_thread_create */
+
/***********************************************************************
Prints counters for work done by srv_master_thread. */
static
@@ -939,9 +968,10 @@ srv_init(void)
srv_sys = mem_alloc(sizeof(srv_sys_t));
kernel_mutex_temp = mem_alloc(sizeof(mutex_t));
- mutex_create(&kernel_mutex, SYNC_KERNEL);
+ mutex_create(kernel_mutex_key, &kernel_mutex, SYNC_KERNEL);
- mutex_create(&srv_innodb_monitor_mutex, SYNC_NO_ORDER_CHECK);
+ mutex_create(srv_innodb_monitor_mutex_key,
+ &srv_innodb_monitor_mutex, SYNC_NO_ORDER_CHECK);
srv_sys->threads = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_slot_t));
@@ -1613,8 +1643,9 @@ srv_suspend_mysql_thread(
innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */
lock_wait_timeout = thd_lock_wait_timeout(trx->mysql_thd);
- if (lock_wait_timeout < 100000000
- && wait_time > (double) lock_wait_timeout) {
+ if (trx_is_interrupted(trx)
+ || (lock_wait_timeout < 100000000
+ && wait_time > (double) lock_wait_timeout)) {
trx->error_state = DB_LOCK_WAIT_TIMEOUT;
}
@@ -1680,12 +1711,15 @@ srv_refresh_innodb_monitor_stats(void)
}
/******************************************************************//**
-Outputs to a file the output of the InnoDB Monitor. */
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information printed
+due to failure to obtain necessary mutex */
UNIV_INTERN
-void
+ibool
srv_printf_innodb_monitor(
/*======================*/
FILE* file, /*!< in: output stream */
+ ibool nowait, /*!< in: whether to wait for kernel mutex */
ulint* trx_start, /*!< out: file position of the start of
the list of active transactions */
ulint* trx_end) /*!< out: file position of the end of
@@ -1694,6 +1728,7 @@ srv_printf_innodb_monitor(
double time_elapsed;
time_t current_time;
ulint n_reserved;
+ ibool ret;
mutex_enter(&srv_innodb_monitor_mutex);
@@ -1717,9 +1752,9 @@ srv_printf_innodb_monitor(
"Per second averages calculated from the last %lu seconds\n",
(ulong)time_elapsed);
- fputs("----------\n"
- "BACKGROUND THREAD\n"
- "----------\n", file);
+ fputs("-----------------\n"
+ "BACKGROUND THREAD\n"
+ "-----------------\n", file);
srv_print_master_thread_info(file);
fputs("----------\n"
@@ -1743,24 +1778,31 @@ srv_printf_innodb_monitor(
mutex_exit(&dict_foreign_err_mutex);
- lock_print_info_summary(file);
- if (trx_start) {
- long t = ftell(file);
- if (t < 0) {
- *trx_start = ULINT_UNDEFINED;
- } else {
- *trx_start = (ulint) t;
+ /* Only if lock_print_info_summary proceeds correctly,
+ before we call the lock_print_info_all_transactions
+ to print all the lock information. */
+ ret = lock_print_info_summary(file, nowait);
+
+ if (ret) {
+ if (trx_start) {
+ long t = ftell(file);
+ if (t < 0) {
+ *trx_start = ULINT_UNDEFINED;
+ } else {
+ *trx_start = (ulint) t;
+ }
}
- }
- lock_print_info_all_transactions(file);
- if (trx_end) {
- long t = ftell(file);
- if (t < 0) {
- *trx_end = ULINT_UNDEFINED;
- } else {
- *trx_end = (ulint) t;
+ lock_print_info_all_transactions(file);
+ if (trx_end) {
+ long t = ftell(file);
+ if (t < 0) {
+ *trx_end = ULINT_UNDEFINED;
+ } else {
+ *trx_end = (ulint) t;
+ }
}
}
+
fputs("--------\n"
"FILE I/O\n"
"--------\n", file);
@@ -1858,6 +1900,8 @@ srv_printf_innodb_monitor(
"============================\n", file);
mutex_exit(&srv_innodb_monitor_mutex);
fflush(file);
+
+ return(ret);
}
/******************************************************************//**
@@ -1945,43 +1989,47 @@ srv_export_innodb_status(void)
}
/*********************************************************************//**
-A thread which wakes up threads whose lock wait may have lasted too long.
-This also prints the info output by various InnoDB monitors.
+A thread which prints the info output by various InnoDB monitors.
@return a dummy parameter */
UNIV_INTERN
os_thread_ret_t
-srv_lock_timeout_and_monitor_thread(
-/*================================*/
+srv_monitor_thread(
+/*===============*/
void* arg __attribute__((unused)))
/*!< in: a dummy parameter required by
os_thread_create */
{
- srv_slot_t* slot;
double time_elapsed;
time_t current_time;
time_t last_table_monitor_time;
time_t last_tablespace_monitor_time;
time_t last_monitor_time;
- ibool some_waits;
- double wait_time;
- ulint i;
+ ulint mutex_skipped;
+ ibool last_srv_print_monitor;
#ifdef UNIV_DEBUG_THREAD_CREATION
fprintf(stderr, "Lock timeout thread starts, id %lu\n",
os_thread_pf(os_thread_get_curr_id()));
#endif
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(srv_monitor_thread_key);
+#endif
+
UT_NOT_USED(arg);
srv_last_monitor_time = time(NULL);
last_table_monitor_time = time(NULL);
last_tablespace_monitor_time = time(NULL);
last_monitor_time = time(NULL);
+ mutex_skipped = 0;
+ last_srv_print_monitor = srv_print_innodb_monitor;
loop:
- srv_lock_timeout_and_monitor_active = TRUE;
+ srv_monitor_active = TRUE;
- /* When someone is waiting for a lock, we wake up every second
- and check if a timeout has passed for a lock wait */
+ /* Wake up every 5 seconds to see if we need to print
+ monitor information. */
- os_thread_sleep(1000000);
+ os_thread_sleep(5000000);
current_time = time(NULL);
@@ -1991,14 +2039,40 @@ loop:
last_monitor_time = time(NULL);
if (srv_print_innodb_monitor) {
- srv_printf_innodb_monitor(stderr, NULL, NULL);
+ /* Reset mutex_skipped counter everytime
+ srv_print_innodb_monitor changes. This is to
+ ensure we will not be blocked by kernel_mutex
+ for short duration information printing,
+ such as requested by sync_array_print_long_waits() */
+ if (!last_srv_print_monitor) {
+ mutex_skipped = 0;
+ last_srv_print_monitor = TRUE;
+ }
+
+ if (!srv_printf_innodb_monitor(stderr,
+ MUTEX_NOWAIT(mutex_skipped),
+ NULL, NULL)) {
+ mutex_skipped++;
+ } else {
+ /* Reset the counter */
+ mutex_skipped = 0;
+ }
+ } else {
+ last_srv_print_monitor = FALSE;
}
+
if (srv_innodb_status) {
mutex_enter(&srv_monitor_file_mutex);
rewind(srv_monitor_file);
- srv_printf_innodb_monitor(srv_monitor_file, NULL,
- NULL);
+ if (!srv_printf_innodb_monitor(srv_monitor_file,
+ MUTEX_NOWAIT(mutex_skipped),
+ NULL, NULL)) {
+ mutex_skipped++;
+ } else {
+ mutex_skipped = 0;
+ }
+
os_file_set_eof(srv_monitor_file);
mutex_exit(&srv_monitor_file_mutex);
}
@@ -2051,6 +2125,60 @@ loop:
}
}
+ if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
+ goto exit_func;
+ }
+
+ if (srv_print_innodb_monitor
+ || srv_print_innodb_lock_monitor
+ || srv_print_innodb_tablespace_monitor
+ || srv_print_innodb_table_monitor) {
+ goto loop;
+ }
+
+ srv_monitor_active = FALSE;
+
+ goto loop;
+
+exit_func:
+ srv_monitor_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+A thread which wakes up threads whose lock wait may have lasted too long.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_lock_timeout_thread(
+/*====================*/
+ void* arg __attribute__((unused)))
+ /* in: a dummy parameter required by
+ os_thread_create */
+{
+ srv_slot_t* slot;
+ ibool some_waits;
+ double wait_time;
+ ulint i;
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(srv_lock_timeout_thread_key);
+#endif
+
+loop:
+ /* When someone is waiting for a lock, we wake up every second
+ and check if a timeout has passed for a lock wait */
+
+ os_thread_sleep(1000000);
+
+ srv_lock_timeout_active = TRUE;
+
mutex_enter(&kernel_mutex);
some_waits = FALSE;
@@ -2074,9 +2202,10 @@ loop:
lock_wait_timeout = thd_lock_wait_timeout(
trx->mysql_thd);
- if (lock_wait_timeout < 100000000
- && (wait_time > (double) lock_wait_timeout
- || wait_time < 0)) {
+ if (trx_is_interrupted(trx)
+ || (lock_wait_timeout < 100000000
+ && (wait_time > (double) lock_wait_timeout
+ || wait_time < 0))) {
/* Timeout exceeded or a wrap-around in system
time counter: cancel the lock request queued
@@ -2101,17 +2230,11 @@ loop:
goto exit_func;
}
- if (some_waits || srv_print_innodb_monitor
- || srv_print_innodb_lock_monitor
- || srv_print_innodb_tablespace_monitor
- || srv_print_innodb_table_monitor) {
+ if (some_waits) {
goto loop;
}
- /* No one was waiting for a lock and no monitor was active:
- suspend this thread */
-
- srv_lock_timeout_and_monitor_active = FALSE;
+ srv_lock_timeout_active = FALSE;
#if 0
/* The following synchronisation is disabled, since
@@ -2121,7 +2244,7 @@ loop:
goto loop;
exit_func:
- srv_lock_timeout_and_monitor_active = FALSE;
+ srv_lock_timeout_active = FALSE;
/* We count the number of threads in os_thread_exit(). A created
thread should always use that to exit and not use return() to exit. */
@@ -2154,6 +2277,11 @@ srv_error_monitor_thread(
fprintf(stderr, "Error monitor thread starts, id %lu\n",
os_thread_pf(os_thread_get_curr_id()));
#endif
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(srv_error_monitor_thread_key);
+#endif
+
loop:
srv_error_monitor_active = TRUE;
@@ -2258,6 +2386,30 @@ srv_active_wake_master_thread(void)
}
/*******************************************************************//**
+Tells the purge thread that there has been activity in the database
+and wakes up the purge thread if it is suspended (not sleeping). Note
+that there is a small chance that the purge thread stays suspended
+(we do not protect our operation with the kernel mutex, for
+performace reasons). */
+UNIV_INTERN
+void
+srv_wake_purge_thread_if_not_active(void)
+/*=====================================*/
+{
+ ut_ad(!mutex_own(&kernel_mutex));
+
+ if (srv_n_purge_threads > 0
+ && srv_n_threads_active[SRV_WORKER] == 0) {
+
+ mutex_enter(&kernel_mutex);
+
+ srv_release_threads(SRV_WORKER, 1);
+
+ mutex_exit(&kernel_mutex);
+ }
+}
+
+/*******************************************************************//**
Wakes up the master thread if it is suspended or being suspended. */
UNIV_INTERN
void
@@ -2273,6 +2425,25 @@ srv_wake_master_thread(void)
mutex_exit(&kernel_mutex);
}
+/*******************************************************************//**
+Wakes up the purge thread if it's not already awake. */
+UNIV_INTERN
+void
+srv_wake_purge_thread(void)
+/*=======================*/
+{
+ ut_ad(!mutex_own(&kernel_mutex));
+
+ if (srv_n_purge_threads > 0) {
+
+ mutex_enter(&kernel_mutex);
+
+ srv_release_threads(SRV_WORKER, 1);
+
+ mutex_exit(&kernel_mutex);
+ }
+}
+
/**********************************************************************
The master thread is tasked to ensure that flush of log file happens
once every second in the background. This is to ensure that not more
@@ -2293,6 +2464,34 @@ srv_sync_log_buffer_in_background(void)
}
}
+/********************************************************************//**
+Do a full purge, reconfigure the purge sub-system if a dynamic
+change is detected. */
+static
+void
+srv_master_do_purge(void)
+/*=====================*/
+{
+ ulint n_pages_purged;
+
+ ut_ad(!mutex_own(&kernel_mutex));
+
+ ut_a(srv_n_purge_threads == 0);
+
+ do {
+ /* Check for shutdown and change in purge config. */
+ if (srv_fast_shutdown && srv_shutdown_state > 0) {
+ /* Nothing to purge. */
+ n_pages_purged = 0;
+ } else {
+ n_pages_purged = trx_purge(srv_purge_batch_size);
+ }
+
+ srv_sync_log_buffer_in_background();
+
+ } while (n_pages_purged > 0);
+}
+
/*********************************************************************//**
The master thread controlling the server.
@return a dummy parameter */
@@ -2315,13 +2514,18 @@ srv_master_thread(
ulint n_ios_old;
ulint n_ios_very_old;
ulint n_pend_ios;
- ibool skip_sleep = FALSE;
+ ulint next_itr_time;
ulint i;
#ifdef UNIV_DEBUG_THREAD_CREATION
fprintf(stderr, "Master thread starts, id %lu\n",
os_thread_pf(os_thread_get_curr_id()));
#endif
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(srv_master_thread_key);
+#endif
+
srv_main_thread_process_no = os_proc_get_number();
srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());
@@ -2358,21 +2562,28 @@ loop:
when there is database activity */
srv_last_log_flush_time = time(NULL);
- skip_sleep = FALSE;
+ next_itr_time = ut_time_ms();
for (i = 0; i < 10; i++) {
+ ulint cur_time = ut_time_ms();
n_ios_old = log_sys->n_log_ios + buf_pool->stat.n_pages_read
+ buf_pool->stat.n_pages_written;
srv_main_thread_op_info = "sleeping";
srv_main_1_second_loops++;
- if (!skip_sleep) {
+ if (next_itr_time > cur_time) {
- os_thread_sleep(1000000);
+ /* Get sleep interval in micro seconds. We use
+ ut_min() to avoid long sleep in case of
+ wrap around. */
+ os_thread_sleep(ut_min(1000000,
+ (next_itr_time - cur_time)
+ * 1000));
srv_main_sleeps++;
}
- skip_sleep = FALSE;
+ /* Each iteration should happen at 1 second interval. */
+ next_itr_time = ut_time_ms() + 1000;
/* ALTER TABLE in MySQL requires on Unix that the table handler
can drop tables lazily after there no longer are SELECT
@@ -2424,12 +2635,6 @@ loop:
PCT_IO(100),
IB_ULONGLONG_MAX);
- /* If we had to do the flush, it may have taken
- even more than 1 second, and also, there may be more
- to flush. Do not sleep 1 second during the next
- iteration of this loop. */
-
- skip_sleep = TRUE;
} else if (srv_adaptive_flushing) {
/* Try to keep the rate of flushing of dirty
@@ -2446,7 +2651,6 @@ loop:
BUF_FLUSH_LIST,
n_flush,
IB_ULONGLONG_MAX);
- skip_sleep = TRUE;
}
}
@@ -2503,20 +2707,16 @@ loop:
/* We run a full purge every 10 seconds, even if the server
were active */
- do {
+ if (srv_n_purge_threads == 0) {
+ srv_main_thread_op_info = "master purging";
+
+ srv_master_do_purge();
if (srv_fast_shutdown && srv_shutdown_state > 0) {
goto background_loop;
}
-
- srv_main_thread_op_info = "purging";
- n_pages_purged = trx_purge();
-
- /* Flush logs if needed */
- srv_sync_log_buffer_in_background();
-
- } while (n_pages_purged);
+ }
srv_main_thread_op_info = "flushing buffer pool pages";
@@ -2585,22 +2785,11 @@ background_loop:
os_thread_sleep(100000);
}
- srv_main_thread_op_info = "purging";
-
- /* Run a full purge */
- do {
- if (srv_fast_shutdown && srv_shutdown_state > 0) {
-
- break;
- }
-
- srv_main_thread_op_info = "purging";
- n_pages_purged = trx_purge();
+ if (srv_n_purge_threads == 0) {
+ srv_main_thread_op_info = "master purging";
- /* Flush logs if needed */
- srv_sync_log_buffer_in_background();
-
- } while (n_pages_purged);
+ srv_master_do_purge();
+ }
srv_main_thread_op_info = "reserving kernel mutex";
@@ -2744,6 +2933,7 @@ suspend_thread:
already when the event wait ends */
os_thread_exit(NULL);
+
}
/* When there is user activity, InnoDB will set the event and the
@@ -2753,3 +2943,100 @@ suspend_thread:
OS_THREAD_DUMMY_RETURN; /* Not reached, avoid compiler warning */
}
+
+/*********************************************************************//**
+Asynchronous purge thread.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_purge_thread(
+/*=============*/
+ void* arg __attribute__((unused))) /*!< in: a dummy parameter
+ required by os_thread_create */
+{
+ srv_slot_t* slot;
+ ulint slot_no = ULINT_UNDEFINED;
+ ulint n_total_purged = ULINT_UNDEFINED;
+
+ ut_a(srv_n_purge_threads == 1);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "InnoDB: Purge thread running, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+ mutex_enter(&kernel_mutex);
+
+ slot_no = srv_table_reserve_slot(SRV_WORKER);
+
+ ++srv_n_threads_active[SRV_WORKER];
+
+ mutex_exit(&kernel_mutex);
+
+ while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) {
+
+ ulint n_pages_purged;
+
+ /* If there are very few records to purge or the last
+ purge didn't purge any records then wait for activity.
+ We peek at the history len without holding any mutex
+ because in the worst case we will end up waiting for
+ the next purge event. */
+ if (trx_sys->rseg_history_len < srv_purge_batch_size
+ || n_total_purged == 0) {
+
+ os_event_t event;
+
+ mutex_enter(&kernel_mutex);
+
+ event = srv_suspend_thread();
+
+ mutex_exit(&kernel_mutex);
+
+ os_event_wait(event);
+ }
+
+ /* Check for shutdown and whether we should do purge at all. */
+ if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND
+ || srv_shutdown_state != 0
+ || srv_fast_shutdown) {
+
+ break;
+ }
+
+ n_total_purged = 0;
+
+ /* Purge until there are no more records to purge and there is
+ no change in configuration or server state. */
+ do {
+ n_pages_purged = trx_purge(srv_purge_batch_size);
+
+ n_total_purged += n_pages_purged;
+
+ } while (n_pages_purged > 0 && !srv_fast_shutdown);
+
+ srv_sync_log_buffer_in_background();
+ }
+
+ /* Free the thread local memory. */
+ thr_local_free(os_thread_get_curr_id());
+
+ mutex_enter(&kernel_mutex);
+
+ /* Free the slot for reuse. */
+ slot = srv_table_get_nth_slot(slot_no);
+ slot->in_use = FALSE;
+
+ mutex_exit(&kernel_mutex);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "InnoDB: Purge thread exiting, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN; /* Not reached, avoid compiler warning */
+}
diff --git a/storage/innobase/srv/srv0start.c b/storage/innobase/srv/srv0start.c
index d5f6120ca31..a257fd32aab 100644
--- a/storage/innobase/srv/srv0start.c
+++ b/storage/innobase/srv/srv0start.c
@@ -1,7 +1,8 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
+Copyright (c) 2009, Percona Inc.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -9,6 +10,13 @@ briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
@@ -22,32 +30,6 @@ this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA
*****************************************************************************/
-/***********************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-Copyright (c) 2009, Percona Inc.
-
-Portions of this file contain modifications contributed and copyrighted
-by Percona Inc.. Those modifications are
-gratefully acknowledged and are described briefly in the InnoDB
-documentation. The contributions by Percona Inc. are incorporated with
-their permission, and subject to the conditions contained in the file
-COPYING.Percona.
-
-This program is free software; you can redistribute it and/or modify it
-under the terms of the GNU General Public License as published by the
-Free Software Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
-Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-***********************************************************************/
/********************************************************************//**
@file srv/srv0start.c
@@ -105,6 +87,7 @@ Created 2/16/1996 Heikki Tuuri
# include "btr0pcur.h"
# include "thr0loc.h"
# include "os0sync.h" /* for INNODB_RW_LOCKS_USE_ATOMICS */
+# include "zlib.h" /* for ZLIB_VERSION */
/** Log sequence number immediately after startup */
UNIV_INTERN ib_uint64_t srv_start_lsn;
@@ -143,9 +126,9 @@ static mutex_t ios_mutex;
static ulint ios;
/** io_handler_thread parameters for thread identification */
-static ulint n[SRV_MAX_N_IO_THREADS + 5];
+static ulint n[SRV_MAX_N_IO_THREADS + 6];
/** io_handler_thread identifiers */
-static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 5];
+static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6];
/** We use this mutex to test the return value of pthread_mutex_trylock
on successful locking. HP-UX does NOT return 0, though Linux et al do. */
@@ -159,6 +142,19 @@ static char* srv_monitor_file_name;
#define SRV_N_PENDING_IOS_PER_THREAD OS_AIO_N_PENDING_IOS_PER_THREAD
#define SRV_MAX_N_PENDING_SYNC_IOS 100
+#ifdef UNIV_PFS_THREAD
+/* Keys to register InnoDB threads with performance schema */
+UNIV_INTERN mysql_pfs_key_t io_handler_thread_key;
+UNIV_INTERN mysql_pfs_key_t srv_lock_timeout_thread_key;
+UNIV_INTERN mysql_pfs_key_t srv_error_monitor_thread_key;
+UNIV_INTERN mysql_pfs_key_t srv_monitor_thread_key;
+UNIV_INTERN mysql_pfs_key_t srv_master_thread_key;
+#endif /* UNIV_PFS_THREAD */
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register ios_mutex_key with performance schema */
+UNIV_INTERN mysql_pfs_key_t ios_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
/*********************************************************************//**
Convert a numeric string that optionally ends in G or M, to a number
@@ -488,6 +484,11 @@ io_handler_thread(
fprintf(stderr, "Io handler thread %lu starts, id %lu\n", segment,
os_thread_pf(os_thread_get_curr_id()));
#endif
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(io_handler_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
for (i = 0;; i++) {
fil_aio_wait(segment);
@@ -601,7 +602,8 @@ open_or_create_log_file(
sprintf(name + dirnamelen, "%s%lu", "ib_logfile", (ulong) i);
- files[i] = os_file_create(name, OS_FILE_CREATE, OS_FILE_NORMAL,
+ files[i] = os_file_create(innodb_file_log_key, name,
+ OS_FILE_CREATE, OS_FILE_NORMAL,
OS_LOG_FILE, &ret);
if (ret == FALSE) {
if (os_file_get_last_error(FALSE) != OS_FILE_ALREADY_EXISTS
@@ -619,7 +621,8 @@ open_or_create_log_file(
return(DB_ERROR);
}
- files[i] = os_file_create(name, OS_FILE_OPEN, OS_FILE_AIO,
+ files[i] = os_file_create(innodb_file_log_key, name,
+ OS_FILE_OPEN, OS_FILE_AIO,
OS_LOG_FILE, &ret);
if (!ret) {
fprintf(stderr,
@@ -784,7 +787,8 @@ open_or_create_data_files(
/* First we try to create the file: if it already
exists, ret will get value FALSE */
- files[i] = os_file_create(name, OS_FILE_CREATE,
+ files[i] = os_file_create(innodb_file_data_key,
+ name, OS_FILE_CREATE,
OS_FILE_NORMAL,
OS_DATA_FILE, &ret);
@@ -811,7 +815,8 @@ open_or_create_data_files(
srv_start_raw_disk_in_use = TRUE;
srv_created_new_raw = TRUE;
- files[i] = os_file_create(name, OS_FILE_OPEN_RAW,
+ files[i] = os_file_create(innodb_file_data_key,
+ name, OS_FILE_OPEN_RAW,
OS_FILE_NORMAL,
OS_DATA_FILE, &ret);
if (!ret) {
@@ -844,14 +849,17 @@ open_or_create_data_files(
if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
files[i] = os_file_create(
+ innodb_file_data_key,
name, OS_FILE_OPEN_RAW,
OS_FILE_NORMAL, OS_DATA_FILE, &ret);
} else if (i == 0) {
files[i] = os_file_create(
+ innodb_file_data_key,
name, OS_FILE_OPEN_RETRY,
OS_FILE_NORMAL, OS_DATA_FILE, &ret);
} else {
files[i] = os_file_create(
+ innodb_file_data_key,
name, OS_FILE_OPEN, OS_FILE_NORMAL,
OS_DATA_FILE, &ret);
}
@@ -994,7 +1002,7 @@ skip_size_check:
ios = 0;
- mutex_create(&ios_mutex, SYNC_NO_ORDER_CHECK);
+ mutex_create(ios_mutex_key, &ios_mutex, SYNC_NO_ORDER_CHECK);
return(DB_SUCCESS);
}
@@ -1074,7 +1082,11 @@ innobase_start_or_create_for_mysql(void)
#ifdef UNIV_IBUF_DEBUG
fprintf(stderr,
"InnoDB: !!!!!!!! UNIV_IBUF_DEBUG switched on !!!!!!!!!\n"
- "InnoDB: Crash recovery will fail with UNIV_IBUF_DEBUG\n");
+# ifdef UNIV_IBUF_COUNT_DEBUG
+ "InnoDB: !!!!!!!! UNIV_IBUF_COUNT_DEBUG switched on !!!!!!!!!\n"
+ "InnoDB: Crash recovery will fail with UNIV_IBUF_COUNT_DEBUG\n"
+# endif
+ );
#endif
#ifdef UNIV_SYNC_DEBUG
@@ -1101,7 +1113,15 @@ innobase_start_or_create_for_mysql(void)
"InnoDB: The InnoDB memory heap is disabled\n");
}
- fprintf(stderr, "InnoDB: %s\n", IB_ATOMICS_STARTUP_MSG);
+ fputs("InnoDB: " IB_ATOMICS_STARTUP_MSG
+ "\nInnoDB: Compressed tables use zlib " ZLIB_VERSION
+#ifdef UNIV_ZIP_DEBUG
+ " with validation"
+#endif /* UNIV_ZIP_DEBUG */
+#ifdef UNIV_ZIP_COPY
+ " and extra copying"
+#endif /* UNIV_ZIP_COPY */
+ "\n" , stderr);
/* Since InnoDB does not currently clean up all its internal data
structures in MySQL Embedded Server Library server_end(), we
@@ -1127,7 +1147,6 @@ innobase_start_or_create_for_mysql(void)
srv_is_being_started = TRUE;
srv_startup_is_before_trx_rollback_phase = TRUE;
- os_aio_use_native_aio = FALSE;
#ifdef __WIN__
switch (os_get_os_version()) {
@@ -1139,14 +1158,29 @@ innobase_start_or_create_for_mysql(void)
but when run in conjunction with InnoDB Hot Backup, it seemed
to corrupt the data files. */
- os_aio_use_native_aio = FALSE;
+ srv_use_native_aio = FALSE;
break;
default:
/* On Win 2000 and XP use async i/o */
- os_aio_use_native_aio = TRUE;
+ srv_use_native_aio = TRUE;
break;
}
+
+#elif defined(LINUX_NATIVE_AIO)
+
+ if (srv_use_native_aio) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Using Linux native AIO\n");
+ }
+#else
+ /* Currently native AIO is supported only on windows and linux
+ and that also when the support is compiled in. In all other
+ cases, we ignore the setting of innodb_use_native_aio. */
+ srv_use_native_aio = FALSE;
+
#endif
+
if (srv_file_flush_method_str == NULL) {
/* These are the default options */
@@ -1171,11 +1205,11 @@ innobase_start_or_create_for_mysql(void)
#else
} else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) {
srv_win_file_flush_method = SRV_WIN_IO_NORMAL;
- os_aio_use_native_aio = FALSE;
+ srv_use_native_aio = FALSE;
} else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) {
srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
- os_aio_use_native_aio = FALSE;
+ srv_use_native_aio = FALSE;
} else if (0 == ut_strcmp(srv_file_flush_method_str,
"async_unbuffered")) {
@@ -1226,7 +1260,8 @@ innobase_start_or_create_for_mysql(void)
return((int) err);
}
- mutex_create(&srv_monitor_file_mutex, SYNC_NO_ORDER_CHECK);
+ mutex_create(srv_monitor_file_mutex_key,
+ &srv_monitor_file_mutex, SYNC_NO_ORDER_CHECK);
if (srv_innodb_status) {
srv_monitor_file_name = mem_alloc(
@@ -1248,14 +1283,16 @@ innobase_start_or_create_for_mysql(void)
}
}
- mutex_create(&srv_dict_tmpfile_mutex, SYNC_DICT_OPERATION);
+ mutex_create(srv_dict_tmpfile_mutex_key,
+ &srv_dict_tmpfile_mutex, SYNC_DICT_OPERATION);
srv_dict_tmpfile = os_file_create_tmpfile();
if (!srv_dict_tmpfile) {
return(DB_ERROR);
}
- mutex_create(&srv_misc_tmpfile_mutex, SYNC_ANY_LATCH);
+ mutex_create(srv_misc_tmpfile_mutex_key,
+ &srv_misc_tmpfile_mutex, SYNC_ANY_LATCH);
srv_misc_tmpfile = os_file_create_tmpfile();
if (!srv_misc_tmpfile) {
@@ -1280,7 +1317,7 @@ innobase_start_or_create_for_mysql(void)
/* TODO: Investigate if SRV_N_PENDING_IOS_PER_THREAD (32) limit
still applies to windows. */
- if (!os_aio_use_native_aio) {
+ if (!srv_use_native_aio) {
io_limit = 8 * SRV_N_PENDING_IOS_PER_THREAD;
} else {
io_limit = SRV_N_PENDING_IOS_PER_THREAD;
@@ -1493,12 +1530,19 @@ innobase_start_or_create_for_mysql(void)
if (create_new_db) {
mtr_start(&mtr);
+
fsp_header_init(0, sum_of_new_sizes, &mtr);
mtr_commit(&mtr);
+ /* To maintain backward compatibility we create only
+ the first rollback segment before the double write buffer.
+ All the remaining rollback segments will be created later,
+ after the double write buffer has been created. */
trx_sys_create();
+
dict_create();
+
srv_startup_is_before_trx_rollback_phase = FALSE;
#ifdef UNIV_LOG_ARCHIVE
@@ -1517,7 +1561,9 @@ innobase_start_or_create_for_mysql(void)
in any disk i/o, first call dict_boot */
dict_boot();
+
trx_sys_init_at_db_start();
+
srv_startup_is_before_trx_rollback_phase = FALSE;
/* Initialize the fsp free limit global variable in the log
@@ -1575,6 +1621,14 @@ innobase_start_or_create_for_mysql(void)
dict_boot();
trx_sys_init_at_db_start();
+ /* Initialize the fsp free limit global variable in the log
+ system */
+ fsp_header_get_free_limit();
+
+ /* recv_recovery_from_checkpoint_finish needs trx lists which
+ are initialized in trx_sys_init_at_db_start(). */
+
+ recv_recovery_from_checkpoint_finish();
if (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE) {
/* The following call is necessary for the insert
buffer to work with multiple tablespaces. We must
@@ -1590,26 +1644,14 @@ innobase_start_or_create_for_mysql(void)
every table in the InnoDB data dictionary that has
an .ibd file.
- We also determine the maximum tablespace id used.
-
- TODO: We may have incomplete transactions in the
- data dictionary tables. Does that harm the scanning of
- the data dictionary below? */
+ We also determine the maximum tablespace id used. */
dict_check_tablespaces_and_store_max_id(
recv_needed_recovery);
}
srv_startup_is_before_trx_rollback_phase = FALSE;
-
- /* Initialize the fsp free limit global variable in the log
- system */
- fsp_header_get_free_limit();
-
- /* recv_recovery_from_checkpoint_finish needs trx lists which
- are initialized in trx_sys_init_at_db_start(). */
-
- recv_recovery_from_checkpoint_finish();
+ recv_recovery_rollback_active();
/* It is possible that file_format tag has never
been set. In this case we initialize it to minimum
@@ -1658,15 +1700,18 @@ innobase_start_or_create_for_mysql(void)
/* fprintf(stderr, "Max allowed record size %lu\n",
page_get_free_space_of_empty() / 2); */
- /* Create the thread which watches the timeouts for lock waits
- and prints InnoDB monitor info */
-
- os_thread_create(&srv_lock_timeout_and_monitor_thread, NULL,
+ /* Create the thread which watches the timeouts for lock waits */
+ os_thread_create(&srv_lock_timeout_thread, NULL,
thread_ids + 2 + SRV_MAX_N_IO_THREADS);
/* Create the thread which warns of long semaphore waits */
os_thread_create(&srv_error_monitor_thread, NULL,
thread_ids + 3 + SRV_MAX_N_IO_THREADS);
+
+ /* Create the thread which prints InnoDB monitor info */
+ os_thread_create(&srv_monitor_thread, NULL,
+ thread_ids + 4 + SRV_MAX_N_IO_THREADS);
+
srv_is_being_started = FALSE;
if (trx_doublewrite == NULL) {
@@ -1675,6 +1720,14 @@ innobase_start_or_create_for_mysql(void)
trx_sys_create_doublewrite_buf();
}
+ /* Here the double write buffer has already been created and so
+ any new rollback segments will be allocated after the double
+ write buffer. The default segment should already exist.
+ We create the new segments only if it's a new database or
+ the database was shutdown cleanly. */
+
+ trx_sys_create_rsegs(TRX_SYS_N_RSEGS - 1);
+
err = dict_create_or_check_foreign_constraint_tables();
if (err != DB_SUCCESS) {
@@ -1686,6 +1739,16 @@ innobase_start_or_create_for_mysql(void)
os_thread_create(&srv_master_thread, NULL, thread_ids
+ (1 + SRV_MAX_N_IO_THREADS));
+
+ /* Currently we allow only a single purge thread. */
+ ut_a(srv_n_purge_threads == 0 || srv_n_purge_threads == 1);
+
+ /* If the user has requested a separate purge thread then
+ start the purge thread. */
+ if (srv_n_purge_threads == 1) {
+ os_thread_create(&srv_purge_thread, NULL, NULL);
+ }
+
#ifdef UNIV_DEBUG
/* buf_debug_prints = TRUE; */
#endif /* UNIV_DEBUG */
@@ -1779,7 +1842,7 @@ innobase_start_or_create_for_mysql(void)
if (srv_print_verbose_log) {
ut_print_timestamp(stderr);
fprintf(stderr,
- " InnoDB Plugin %s started; "
+ " InnoDB %s started; "
"log sequence number %llu\n",
INNODB_VERSION_STR, srv_start_lsn);
}
@@ -1939,7 +2002,10 @@ innobase_shutdown_for_mysql(void)
/* c. We wake the master thread so that it exits */
srv_wake_master_thread();
- /* d. Exit the i/o threads */
+ /* d. We wake the purge thread so that it exits */
+ srv_wake_purge_thread();
+
+ /* e. Exit the i/o threads */
os_aio_wake_all_threads_at_shutdown();
diff --git a/storage/innobase/sync/sync0arr.c b/storage/innobase/sync/sync0arr.c
index ed9e25bf2f2..2cdac11a608 100644
--- a/storage/innobase/sync/sync0arr.c
+++ b/storage/innobase/sync/sync0arr.c
@@ -138,6 +138,11 @@ struct sync_array_struct {
since creation of the array */
};
+#ifdef UNIV_PFS_MUTEX
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t syn_arr_mutex_key;
+#endif
+
#ifdef UNIV_SYNC_DEBUG
/******************************************************************//**
This function is called only in the debug version. Detects a deadlock
@@ -247,7 +252,8 @@ sync_array_create(
if (protection == SYNC_ARRAY_OS_MUTEX) {
arr->os_mutex = os_mutex_create(NULL);
} else if (protection == SYNC_ARRAY_MUTEX) {
- mutex_create(&arr->mutex, SYNC_NO_ORDER_CHECK);
+ mutex_create(syn_arr_mutex_key,
+ &arr->mutex, SYNC_NO_ORDER_CHECK);
} else {
ut_error;
}
diff --git a/storage/innobase/sync/sync0rw.c b/storage/innobase/sync/sync0rw.c
index d231b6acdf7..c05c823ff61 100644
--- a/storage/innobase/sync/sync0rw.c
+++ b/storage/innobase/sync/sync0rw.c
@@ -168,12 +168,22 @@ UNIV_INTERN ib_int64_t rw_x_exit_count = 0;
UNIV_INTERN rw_lock_list_t rw_lock_list;
UNIV_INTERN mutex_t rw_lock_list_mutex;
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t rw_lock_list_mutex_key;
+UNIV_INTERN mysql_pfs_key_t rw_lock_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
#ifdef UNIV_SYNC_DEBUG
/* The global mutex which protects debug info lists of all rw-locks.
To modify the debug info list of an rw-lock, this mutex has to be
acquired in addition to the mutex protecting the lock. */
UNIV_INTERN mutex_t rw_lock_debug_mutex;
+
+# ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t rw_lock_debug_mutex_key;
+# endif
+
/* If deadlock detection does not get immediately the mutex,
it may wait for this event */
UNIV_INTERN os_event_t rw_lock_debug_event;
@@ -231,7 +241,7 @@ rw_lock_create_func(
# ifdef UNIV_SYNC_DEBUG
ulint level, /*!< in: level */
# endif /* UNIV_SYNC_DEBUG */
- const char* cmutex_name, /*!< in: mutex name */
+ const char* cmutex_name, /*!< in: mutex name */
#endif /* UNIV_DEBUG */
const char* cfile_name, /*!< in: file name where created */
ulint cline) /*!< in: file line where created */
@@ -240,7 +250,8 @@ rw_lock_create_func(
created, then the following call initializes the sync system. */
#ifndef INNODB_RW_LOCKS_USE_ATOMICS
- mutex_create(rw_lock_get_mutex(lock), SYNC_NO_ORDER_CHECK);
+ mutex_create(rw_lock_mutex_key, rw_lock_get_mutex(lock),
+ SYNC_NO_ORDER_CHECK);
lock->mutex.cfile_name = cfile_name;
lock->mutex.cline = cline;
@@ -298,8 +309,8 @@ the rw-lock is freed. Removes an rw-lock object from the global list. The
rw-lock is checked to be in the non-locked state. */
UNIV_INTERN
void
-rw_lock_free(
-/*=========*/
+rw_lock_free_func(
+/*==============*/
rw_lock_t* lock) /*!< in: rw-lock */
{
ut_ad(rw_lock_validate(lock));
@@ -607,7 +618,7 @@ rw_lock_x_lock_func(
{
ulint index; /*!< index of the reserved wait cell */
ulint i; /*!< spin round count */
- ibool spinning = FALSE;
+ ibool spinning = FALSE;
ut_ad(rw_lock_validate(lock));
diff --git a/storage/innobase/sync/sync0sync.c b/storage/innobase/sync/sync0sync.c
index 569fc6328c4..b9b83adba00 100644
--- a/storage/innobase/sync/sync0sync.c
+++ b/storage/innobase/sync/sync0sync.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -198,6 +198,10 @@ UNIV_INTERN sync_thread_t* sync_thread_level_arrays;
/** Mutex protecting sync_thread_level_arrays */
UNIV_INTERN mutex_t sync_thread_mutex;
+
+# ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t sync_thread_mutex_key;
+# endif /* UNIV_PFS_MUTEX */
#endif /* UNIV_SYNC_DEBUG */
/** Global list of database mutexes (not OS mutexes) created. */
@@ -206,6 +210,10 @@ UNIV_INTERN ut_list_base_node_t mutex_list;
/** Mutex protecting the mutex_list variable */
UNIV_INTERN mutex_t mutex_list_mutex;
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t mutex_list_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
#ifdef UNIV_SYNC_DEBUG
/** Latching order checks start when this is set TRUE */
UNIV_INTERN ibool sync_order_checks_on = FALSE;
@@ -302,19 +310,29 @@ mutex_create_func(
}
/******************************************************************//**
+NOTE! Use the corresponding macro mutex_free(), not directly this function!
Calling this function is obligatory only if the memory buffer containing
the mutex is freed. Removes a mutex object from the mutex list. The mutex
is checked to be in the reset state. */
UNIV_INTERN
void
-mutex_free(
-/*=======*/
+mutex_free_func(
+/*============*/
mutex_t* mutex) /*!< in: mutex */
{
ut_ad(mutex_validate(mutex));
ut_a(mutex_get_lock_word(mutex) == 0);
ut_a(mutex_get_waiters(mutex) == 0);
+#ifdef UNIV_MEM_DEBUG
+ if (mutex == &mem_hash_mutex) {
+ ut_ad(UT_LIST_GET_LEN(mutex_list) == 1);
+ ut_ad(UT_LIST_GET_FIRST(mutex_list) == &mem_hash_mutex);
+ UT_LIST_REMOVE(list, mutex_list, mutex);
+ goto func_exit;
+ }
+#endif /* UNIV_MEM_DEBUG */
+
if (mutex != &mutex_list_mutex
#ifdef UNIV_SYNC_DEBUG
&& mutex != &sync_thread_mutex
@@ -336,7 +354,9 @@ mutex_free(
}
os_event_free(mutex->event);
-
+#ifdef UNIV_MEM_DEBUG
+func_exit:
+#endif /* UNIV_MEM_DEBUG */
#if !defined(HAVE_ATOMIC_BUILTINS)
os_fast_mutex_free(&(mutex->os_fast_mutex));
#endif
@@ -947,12 +967,62 @@ sync_thread_levels_contain(
}
/******************************************************************//**
+Checks if the level array for the current thread contains a
+mutex or rw-latch at the specified level.
+@return a matching latch, or NULL if not found */
+UNIV_INTERN
+void*
+sync_thread_levels_contains(
+/*========================*/
+ ulint level) /*!< in: latching order level
+ (SYNC_DICT, ...)*/
+{
+ sync_level_t* arr;
+ sync_thread_t* thread_slot;
+ sync_level_t* slot;
+ ulint i;
+
+ if (!sync_order_checks_on) {
+
+ return(NULL);
+ }
+
+ mutex_enter(&sync_thread_mutex);
+
+ thread_slot = sync_thread_level_arrays_find_slot();
+
+ if (thread_slot == NULL) {
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(NULL);
+ }
+
+ arr = thread_slot->levels;
+
+ for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) {
+
+ slot = sync_thread_levels_get_nth(arr, i);
+
+ if (slot->latch != NULL && slot->level == level) {
+
+ mutex_exit(&sync_thread_mutex);
+ return(slot->latch);
+ }
+ }
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(NULL);
+}
+
+/******************************************************************//**
Checks that the level array for the current thread is empty.
-@return TRUE if empty except the exceptions specified below */
+@return a latch, or NULL if empty except the exceptions specified below */
UNIV_INTERN
-ibool
-sync_thread_levels_empty_gen(
-/*=========================*/
+void*
+sync_thread_levels_nonempty_gen(
+/*============================*/
ibool dict_mutex_allowed) /*!< in: TRUE if dictionary mutex is
allowed to be owned by the thread,
also purge_is_running mutex is
@@ -965,7 +1035,7 @@ sync_thread_levels_empty_gen(
if (!sync_order_checks_on) {
- return(TRUE);
+ return(NULL);
}
mutex_enter(&sync_thread_mutex);
@@ -976,7 +1046,7 @@ sync_thread_levels_empty_gen(
mutex_exit(&sync_thread_mutex);
- return(TRUE);
+ return(NULL);
}
arr = thread_slot->levels;
@@ -993,13 +1063,13 @@ sync_thread_levels_empty_gen(
mutex_exit(&sync_thread_mutex);
ut_error;
- return(FALSE);
+ return(slot->latch);
}
}
mutex_exit(&sync_thread_mutex);
- return(TRUE);
+ return(NULL);
}
/******************************************************************//**
@@ -1092,6 +1162,8 @@ sync_thread_add_level(
case SYNC_TRX_SYS_HEADER:
case SYNC_FILE_FORMAT_TAG:
case SYNC_DOUBLEWRITE:
+ case SYNC_BUF_FLUSH_LIST:
+ case SYNC_BUF_FLUSH_ORDER:
case SYNC_BUF_POOL:
case SYNC_SEARCH_SYS:
case SYNC_SEARCH_SYS_CONF:
@@ -1337,18 +1409,22 @@ sync_init(void)
/* Init the mutex list and create the mutex to protect it. */
UT_LIST_INIT(mutex_list);
- mutex_create(&mutex_list_mutex, SYNC_NO_ORDER_CHECK);
+ mutex_create(mutex_list_mutex_key, &mutex_list_mutex,
+ SYNC_NO_ORDER_CHECK);
#ifdef UNIV_SYNC_DEBUG
- mutex_create(&sync_thread_mutex, SYNC_NO_ORDER_CHECK);
+ mutex_create(sync_thread_mutex_key, &sync_thread_mutex,
+ SYNC_NO_ORDER_CHECK);
#endif /* UNIV_SYNC_DEBUG */
/* Init the rw-lock list and create the mutex to protect it. */
UT_LIST_INIT(rw_lock_list);
- mutex_create(&rw_lock_list_mutex, SYNC_NO_ORDER_CHECK);
+ mutex_create(rw_lock_list_mutex_key, &rw_lock_list_mutex,
+ SYNC_NO_ORDER_CHECK);
#ifdef UNIV_SYNC_DEBUG
- mutex_create(&rw_lock_debug_mutex, SYNC_NO_ORDER_CHECK);
+ mutex_create(rw_lock_debug_mutex_key, &rw_lock_debug_mutex,
+ SYNC_NO_ORDER_CHECK);
rw_lock_debug_event = os_event_create(NULL);
rw_lock_debug_waiters = FALSE;
@@ -1370,6 +1446,12 @@ sync_close(void)
mutex = UT_LIST_GET_FIRST(mutex_list);
while (mutex) {
+#ifdef UNIV_MEM_DEBUG
+ if (mutex == &mem_hash_mutex) {
+ mutex = UT_LIST_GET_NEXT(list, mutex);
+ continue;
+ }
+#endif /* UNIV_MEM_DEBUG */
mutex_free(mutex);
mutex = UT_LIST_GET_FIRST(mutex_list);
}
diff --git a/storage/innobase/thr/thr0loc.c b/storage/innobase/thr/thr0loc.c
index 59a234a6b72..045ff3e9fb1 100644
--- a/storage/innobase/thr/thr0loc.c
+++ b/storage/innobase/thr/thr0loc.c
@@ -53,6 +53,11 @@ static hash_table_t* thr_local_hash = NULL;
/** Thread local data */
typedef struct thr_local_struct thr_local_t;
+#ifdef UNIV_PFS_MUTEX
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t thr_local_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
/** @brief Thread local data.
The private data for each thread should be put to
the structure below and the accessor functions written
@@ -244,7 +249,8 @@ thr_local_init(void)
thr_local_hash = hash_create(OS_THREAD_MAX_N + 100);
- mutex_create(&thr_local_mutex, SYNC_THR_LOCAL);
+ mutex_create(thr_local_mutex_key,
+ &thr_local_mutex, SYNC_THR_LOCAL);
}
/********************************************************************
diff --git a/storage/innobase/trx/trx0i_s.c b/storage/innobase/trx/trx0i_s.c
index 1b20eaabf42..ba8f998affd 100644
--- a/storage/innobase/trx/trx0i_s.c
+++ b/storage/innobase/trx/trx0i_s.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -28,11 +28,18 @@ table cache" for later retrieval.
Created July 17, 2007 Vasil Dimov
*******************************************************/
+/* Found during the build of 5.5.3 on Linux 2.4 and early 2.6 kernels:
+ The includes "univ.i" -> "my_global.h" cause a different path
+ to be taken further down with pthread functions and types,
+ so they must come first.
+ From the symptoms, this is related to bug#46587 in the MySQL bug DB.
+*/
+#include "univ.i"
+
#include <mysql/plugin.h>
#include "mysql_addons.h"
-#include "univ.i"
#include "buf0buf.h"
#include "dict0dict.h"
#include "ha0storage.h"
@@ -186,6 +193,15 @@ INFORMATION SCHEMA tables is fetched and later retrieved by the C++
code in handler/i_s.cc. */
UNIV_INTERN trx_i_s_cache_t* trx_i_s_cache = &trx_i_s_cache_static;
+/* Key to register the lock/mutex with performance schema */
+#ifdef UNIV_PFS_RWLOCK
+UNIV_INTERN mysql_pfs_key_t trx_i_s_cache_lock_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t cache_last_read_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
/*******************************************************************//**
For a record lock that is in waiting state retrieves the only bit that
is set, for a table lock returns ULINT_UNDEFINED.
@@ -1246,11 +1262,13 @@ trx_i_s_cache_init(
release trx_i_s_cache_t::last_read_mutex
release trx_i_s_cache_t::rw_lock */
- rw_lock_create(&cache->rw_lock, SYNC_TRX_I_S_RWLOCK);
+ rw_lock_create(trx_i_s_cache_lock_key, &cache->rw_lock,
+ SYNC_TRX_I_S_RWLOCK);
cache->last_read = 0;
- mutex_create(&cache->last_read_mutex, SYNC_TRX_I_S_LAST_READ);
+ mutex_create(cache_last_read_mutex_key,
+ &cache->last_read_mutex, SYNC_TRX_I_S_LAST_READ);
table_cache_init(&cache->innodb_trx, sizeof(i_s_trx_row_t));
table_cache_init(&cache->innodb_locks, sizeof(i_s_locks_row_t));
diff --git a/storage/innobase/trx/trx0purge.c b/storage/innobase/trx/trx0purge.c
index abbfa3d7f81..550a8c9c4b3 100644
--- a/storage/innobase/trx/trx0purge.c
+++ b/storage/innobase/trx/trx0purge.c
@@ -41,7 +41,7 @@ Created 3/26/1996 Heikki Tuuri
#include "row0purge.h"
#include "row0upd.h"
#include "trx0rec.h"
-#include "srv0que.h"
+#include "srv0srv.h"
#include "os0thread.h"
/** The global data structure coordinating a purge */
@@ -51,6 +51,16 @@ UNIV_INTERN trx_purge_t* purge_sys = NULL;
which needs no purge */
UNIV_INTERN trx_undo_rec_t trx_purge_dummy_rec;
+#ifdef UNIV_PFS_RWLOCK
+/* Key to register trx_purge_latch with performance schema */
+UNIV_INTERN mysql_pfs_key_t trx_purge_latch_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register purge_sys_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t purge_sys_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
/*****************************************************************//**
Checks if trx_id is >= purge_view: then it is guaranteed that its update
undo log still exists in the system.
@@ -227,9 +237,11 @@ trx_purge_sys_create(void)
purge_sys->purge_undo_no = ut_dulint_zero;
purge_sys->next_stored = FALSE;
- rw_lock_create(&purge_sys->latch, SYNC_PURGE_LATCH);
+ rw_lock_create(trx_purge_latch_key,
+ &purge_sys->latch, SYNC_PURGE_LATCH);
- mutex_create(&purge_sys->mutex, SYNC_PURGE_SYS);
+ mutex_create(purge_sys_mutex_key,
+ &purge_sys->mutex, SYNC_PURGE_SYS);
purge_sys->heap = mem_heap_create(256);
@@ -352,6 +364,11 @@ trx_purge_add_update_undo_to_history(
trx_sys->rseg_history_len++;
mutex_exit(&kernel_mutex);
+ if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) {
+ /* Inform the purge thread that there is work to do. */
+ srv_wake_purge_thread_if_not_active();
+ }
+
/* Write the trx number to the undo log header */
mlog_write_dulint(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr);
/* Write information about delete markings to the undo log header */
@@ -1084,8 +1101,10 @@ This function runs a purge batch.
@return number of undo log pages handled in the batch */
UNIV_INTERN
ulint
-trx_purge(void)
-/*===========*/
+trx_purge(
+/*======*/
+ ulint limit) /*!< in: the maximum number of records to
+ purge in one batch */
{
que_thr_t* thr;
/* que_thr_t* thr2; */
@@ -1146,9 +1165,7 @@ trx_purge(void)
purge_sys->state = TRX_PURGE_ON;
- /* Handle at most 20 undo log pages in one purge batch */
-
- purge_sys->handle_limit = purge_sys->n_pages_handled + 20;
+ purge_sys->handle_limit = purge_sys->n_pages_handled + limit;
old_pages_handled = purge_sys->n_pages_handled;
diff --git a/storage/innobase/trx/trx0rec.c b/storage/innobase/trx/trx0rec.c
index 5097cf18dcd..bcc1f81381e 100644
--- a/storage/innobase/trx/trx0rec.c
+++ b/storage/innobase/trx/trx0rec.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -977,6 +977,7 @@ trx_undo_update_rec_get_update(
fprintf(stderr, "\n"
"InnoDB: n_fields = %lu, i = %lu, ptr %p\n",
(ulong) n_fields, (ulong) i, ptr);
+ *upd = NULL;
return(NULL);
}
diff --git a/storage/innobase/trx/trx0roll.c b/storage/innobase/trx/trx0roll.c
index c925478cdf4..6e72b13e116 100644
--- a/storage/innobase/trx/trx0roll.c
+++ b/storage/innobase/trx/trx0roll.c
@@ -615,6 +615,10 @@ trx_rollback_or_clean_all_recovered(
/*!< in: a dummy parameter required by
os_thread_create */
{
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(trx_rollback_clean_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
trx_rollback_or_clean_recovered(TRUE);
/* We count the number of threads in os_thread_exit(). A created
diff --git a/storage/innobase/trx/trx0rseg.c b/storage/innobase/trx/trx0rseg.c
index 8d754788e2a..b458364b05d 100644
--- a/storage/innobase/trx/trx0rseg.c
+++ b/storage/innobase/trx/trx0rseg.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -34,6 +34,11 @@ Created 3/26/1996 Heikki Tuuri
#include "srv0srv.h"
#include "trx0purge.h"
+#ifdef UNIV_PFS_MUTEX
+/* Key to register rseg_mutex_key with performance schema */
+UNIV_INTERN mysql_pfs_key_t rseg_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
/******************************************************************//**
Looks for a rollback segment, based on the rollback segment id.
@return rollback segment */
@@ -46,11 +51,9 @@ trx_rseg_get_on_id(
trx_rseg_t* rseg;
rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
- ut_ad(rseg);
- while (rseg->id != id) {
+ while (rseg && rseg->id != id) {
rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
- ut_ad(rseg);
}
return(rseg);
@@ -68,7 +71,7 @@ trx_rseg_header_create(
ulint zip_size, /*!< in: compressed page size in bytes
or 0 for uncompressed pages */
ulint max_size, /*!< in: max size in pages */
- ulint* slot_no, /*!< out: rseg id == slot number in trx sys */
+ ulint rseg_slot_no, /*!< in: rseg id == slot number in trx sys */
mtr_t* mtr) /*!< in: mtr */
{
ulint page_no;
@@ -81,14 +84,6 @@ trx_rseg_header_create(
ut_ad(mutex_own(&kernel_mutex));
ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
MTR_MEMO_X_LOCK));
- sys_header = trx_sysf_get(mtr);
-
- *slot_no = trx_sysf_rseg_find_free(mtr);
-
- if (*slot_no == ULINT_UNDEFINED) {
-
- return(FIL_NULL);
- }
/* Allocate a new file segment for the rollback segment */
block = fseg_create(space, 0,
@@ -122,11 +117,13 @@ trx_rseg_header_create(
trx_rsegf_set_nth_undo(rsegf, i, FIL_NULL, mtr);
}
- /* Add the rollback segment info to the free slot in the trx system
- header */
+ /* Add the rollback segment info to the free slot in
+ the trx system header */
- trx_sysf_rseg_set_space(sys_header, *slot_no, space, mtr);
- trx_sysf_rseg_set_page_no(sys_header, *slot_no, page_no, mtr);
+ sys_header = trx_sysf_get(mtr);
+
+ trx_sysf_rseg_set_space(sys_header, rseg_slot_no, space, mtr);
+ trx_sysf_rseg_set_page_no(sys_header, rseg_slot_no, page_no, mtr);
return(page_no);
}
@@ -191,23 +188,23 @@ trx_rseg_mem_create(
ulint page_no, /*!< in: page number of the segment header */
mtr_t* mtr) /*!< in: mtr */
{
- trx_rsegf_t* rseg_header;
+ ulint len;
trx_rseg_t* rseg;
- trx_ulogf_t* undo_log_hdr;
fil_addr_t node_addr;
+ trx_rsegf_t* rseg_header;
+ trx_ulogf_t* undo_log_hdr;
ulint sum_of_undo_sizes;
- ulint len;
ut_ad(mutex_own(&kernel_mutex));
- rseg = mem_alloc(sizeof(trx_rseg_t));
+ rseg = mem_zalloc(sizeof(trx_rseg_t));
rseg->id = id;
rseg->space = space;
rseg->zip_size = zip_size;
rseg->page_no = page_no;
- mutex_create(&rseg->mutex, SYNC_RSEG);
+ mutex_create(rseg_mutex_key, &rseg->mutex, SYNC_RSEG);
UT_LIST_ADD_LAST(rseg_list, trx_sys->rseg_list, rseg);
@@ -250,75 +247,108 @@ trx_rseg_mem_create(
return(rseg);
}
-/*********************************************************************//**
-Creates the memory copies for rollback segments and initializes the
+/********************************************************************
+Creates the memory copies for the rollback segments and initializes the
rseg list and array in trx_sys at a database startup. */
-UNIV_INTERN
+static
void
-trx_rseg_list_and_array_init(
-/*=========================*/
+trx_rseg_create_instance(
+/*=====================*/
trx_sysf_t* sys_header, /*!< in: trx system header */
mtr_t* mtr) /*!< in: mtr */
{
- ulint i;
- ulint page_no;
- ulint space;
-
- UT_LIST_INIT(trx_sys->rseg_list);
-
- trx_sys->rseg_history_len = 0;
+ ulint i;
for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+ ulint page_no;
page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr);
if (page_no == FIL_NULL) {
-
trx_sys_set_nth_rseg(trx_sys, i, NULL);
} else {
- ulint zip_size;
+ ulint space;
+ ulint zip_size;
+ trx_rseg_t* rseg = NULL;
+
+ ut_a(!trx_rseg_get_on_id(i));
space = trx_sysf_rseg_get_space(sys_header, i, mtr);
zip_size = space ? fil_space_get_zip_size(space) : 0;
- trx_rseg_mem_create(i, space, zip_size, page_no, mtr);
+ rseg = trx_rseg_mem_create(
+ i, space, zip_size, page_no, mtr);
+
+ ut_a(rseg->id == i);
}
}
}
-/****************************************************************//**
-Creates a new rollback segment to the database.
-@return the created segment object, NULL if fail */
+/*********************************************************************
+Creates a rollback segment.
+@return pointer to new rollback segment if create successful */
UNIV_INTERN
trx_rseg_t*
-trx_rseg_create(
-/*============*/
- ulint space, /*!< in: space id */
- ulint max_size, /*!< in: max size in pages */
- ulint* id, /*!< out: rseg id */
- mtr_t* mtr) /*!< in: mtr */
+trx_rseg_create(void)
+/*=================*/
{
- ulint flags;
- ulint zip_size;
- ulint page_no;
- trx_rseg_t* rseg;
+ mtr_t mtr;
+ ulint slot_no;
+ trx_rseg_t* rseg = NULL;
+
+ mtr_start(&mtr);
+
+ /* To obey the latching order, acquire the file space
+ x-latch before the kernel mutex. */
+ mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), &mtr);
- mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
- zip_size = dict_table_flags_to_zip_size(flags);
mutex_enter(&kernel_mutex);
- page_no = trx_rseg_header_create(space, zip_size, max_size, id, mtr);
+ slot_no = trx_sysf_rseg_find_free(&mtr);
- if (page_no == FIL_NULL) {
+ if (slot_no != ULINT_UNDEFINED) {
+ ulint space;
+ ulint page_no;
+ ulint zip_size;
+ trx_sysf_t* sys_header;
- mutex_exit(&kernel_mutex);
- return(NULL);
- }
+ page_no = trx_rseg_header_create(
+ TRX_SYS_SPACE, 0, ULINT_MAX, slot_no, &mtr);
+
+ ut_a(page_no != FIL_NULL);
+
+ ut_ad(!trx_rseg_get_on_id(slot_no));
- rseg = trx_rseg_mem_create(*id, space, zip_size, page_no, mtr);
+ sys_header = trx_sysf_get(&mtr);
+
+ space = trx_sysf_rseg_get_space(sys_header, slot_no, &mtr);
+
+ zip_size = space ? fil_space_get_zip_size(space) : 0;
+
+ rseg = trx_rseg_mem_create(
+ slot_no, space, zip_size, page_no, &mtr);
+ }
mutex_exit(&kernel_mutex);
+ mtr_commit(&mtr);
return(rseg);
}
+
+/********************************************************************
+Initialize the rollback instance list. */
+UNIV_INTERN
+void
+trx_rseg_list_and_array_init(
+/*=========================*/
+ trx_sysf_t* sys_header, /* in: trx system header */
+ mtr_t* mtr) /* in: mtr */
+{
+ UT_LIST_INIT(trx_sys->rseg_list);
+
+ trx_sys->rseg_history_len = 0;
+
+ trx_rseg_create_instance(sys_header, mtr);
+}
+
diff --git a/storage/innobase/trx/trx0sys.c b/storage/innobase/trx/trx0sys.c
index 79e5af1c677..9c531e64662 100644
--- a/storage/innobase/trx/trx0sys.c
+++ b/storage/innobase/trx/trx0sys.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -39,6 +39,7 @@ Created 3/26/1996 Heikki Tuuri
#include "srv0srv.h"
#include "trx0purge.h"
#include "log0log.h"
+#include "log0recv.h"
#include "os0file.h"
#include "read0read.h"
@@ -126,6 +127,12 @@ static const char* file_format_name_map[] = {
static const ulint FILE_FORMAT_NAME_N
= sizeof(file_format_name_map) / sizeof(file_format_name_map[0]);
+#ifdef UNIV_PFS_MUTEX
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t trx_doublewrite_mutex_key;
+UNIV_INTERN mysql_pfs_key_t file_format_max_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
#ifndef UNIV_HOTBACKUP
/** This is used to track the maximum file format id known to InnoDB. It's
updated via SET GLOBAL innodb_file_format_check = 'x' or when we open
@@ -179,7 +186,8 @@ trx_doublewrite_init(
os_do_not_call_flush_at_each_write = TRUE;
#endif /* UNIV_DO_FLUSH */
- mutex_create(&trx_doublewrite->mutex, SYNC_DOUBLEWRITE);
+ mutex_create(trx_doublewrite_mutex_key,
+ &trx_doublewrite->mutex, SYNC_DOUBLEWRITE);
trx_doublewrite->first_free = 0;
@@ -584,8 +592,8 @@ trx_sys_doublewrite_init_or_restore_pages(
" recover the database"
" with the my.cnf\n"
"InnoDB: option:\n"
- "InnoDB: set-variable="
- "innodb_force_recovery=6\n");
+ "InnoDB:"
+ " innodb_force_recovery=6\n");
exit(1);
}
@@ -870,7 +878,8 @@ trx_sysf_create(
buf_block_t* block;
page_t* page;
ulint page_no;
- ulint i;
+ byte* ptr;
+ ulint len;
ut_ad(mtr);
@@ -903,32 +912,31 @@ trx_sysf_create(
sys_header = trx_sysf_get(mtr);
/* Start counting transaction ids from number 1 up */
- mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE,
- ut_dulint_create(0, 1), mtr);
+ mach_write_to_8(sys_header + TRX_SYS_TRX_ID_STORE,
+ ut_dulint_create(0, 1));
- /* Reset the rollback segment slots */
- for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+ /* Reset the rollback segment slots. Old versions of InnoDB
+ define TRX_SYS_N_RSEGS as 256 (TRX_SYS_OLD_N_RSEGS) and expect
+ that the whole array is initialized. */
+ ptr = TRX_SYS_RSEGS + sys_header;
+ len = ut_max(TRX_SYS_OLD_N_RSEGS, TRX_SYS_N_RSEGS)
+ * TRX_SYS_RSEG_SLOT_SIZE;
+ memset(ptr, 0xff, len);
+ ptr += len;
+ ut_a(ptr <= page + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END));
- trx_sysf_rseg_set_space(sys_header, i, ULINT_UNDEFINED, mtr);
- trx_sysf_rseg_set_page_no(sys_header, i, FIL_NULL, mtr);
- }
+ /* Initialize all of the page. This part used to be uninitialized. */
+ memset(ptr, 0, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page - ptr);
- /* The remaining area (up to the page trailer) is uninitialized.
- Silence Valgrind warnings about it. */
- UNIV_MEM_VALID(sys_header + (TRX_SYS_RSEGS
- + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
- + TRX_SYS_RSEG_SPACE),
- (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END
- - (TRX_SYS_RSEGS
- + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
- + TRX_SYS_RSEG_SPACE))
- + page - sys_header);
+ mlog_log_string(sys_header, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END
+ + page - sys_header, mtr);
/* Create the first rollback segment in the SYSTEM tablespace */
- page_no = trx_rseg_header_create(TRX_SYS_SPACE, 0, ULINT_MAX, &slot_no,
+ slot_no = trx_sysf_rseg_find_free(mtr);
+ page_no = trx_rseg_header_create(TRX_SYS_SPACE, 0, ULINT_MAX, slot_no,
mtr);
ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
- ut_a(page_no != FIL_NULL);
+ ut_a(page_no == FSP_FIRST_RSEG_PAGE_NO);
mutex_exit(&kernel_mutex);
}
@@ -1283,7 +1291,8 @@ void
trx_sys_file_format_init(void)
/*==========================*/
{
- mutex_create(&file_format_max.mutex, SYNC_FILE_FORMAT_TAG);
+ mutex_create(file_format_max_mutex_key,
+ &file_format_max.mutex, SYNC_FILE_FORMAT_TAG);
/* We don't need a mutex here, as this function should only
be called once at start up. */
@@ -1302,6 +1311,40 @@ trx_sys_file_format_close(void)
{
/* Does nothing at the moment */
}
+
+/*********************************************************************
+Creates the rollback segments */
+UNIV_INTERN
+void
+trx_sys_create_rsegs(
+/*=================*/
+ ulint n_rsegs) /*!< number of rollback segments to create */
+{
+ ulint new_rsegs = 0;
+
+ /* Do not create additional rollback segments if
+ innodb_force_recovery has been set and the database
+ was not shutdown cleanly. */
+ if (!srv_force_recovery && !recv_needed_recovery) {
+ ulint i;
+
+ for (i = 0; i < n_rsegs; ++i) {
+
+ if (trx_rseg_create() != NULL) {
+ ++new_rsegs;
+ } else {
+ break;
+ }
+ }
+ }
+
+ if (new_rsegs > 0) {
+ fprintf(stderr,
+ "InnoDB: %lu rollback segment(s) active.\n",
+ new_rsegs);
+ }
+}
+
#else /* !UNIV_HOTBACKUP */
/*****************************************************************//**
Prints to stderr the MySQL binlog info in the system header if the
@@ -1376,8 +1419,9 @@ trx_sys_read_file_format_id(
dulint file_format_id;
*format_id = ULINT_UNDEFINED;
-
+
file = os_file_create_simple_no_error_handling(
+ innodb_file_data_key,
pathname,
OS_FILE_OPEN,
OS_FILE_READ_ONLY,
@@ -1456,8 +1500,9 @@ trx_sys_read_pertable_file_format_id(
ib_uint32_t flags;
*format_id = ULINT_UNDEFINED;
-
+
file = os_file_create_simple_no_error_handling(
+ innodb_file_data_key,
pathname,
OS_FILE_OPEN,
OS_FILE_READ_ONLY,
@@ -1535,6 +1580,7 @@ trx_sys_file_format_id_to_name(
#endif /* !UNIV_HOTBACKUP */
+#ifndef UNIV_HOTBACKUP
/*********************************************************************
Shutdown/Close the transaction system. */
UNIV_INTERN
@@ -1611,3 +1657,4 @@ trx_sys_close(void)
trx_sys = NULL;
mutex_exit(&kernel_mutex);
}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/trx/trx0trx.c b/storage/innobase/trx/trx0trx.c
index 0951b98b79f..a47fc28c199 100644
--- a/storage/innobase/trx/trx0trx.c
+++ b/storage/innobase/trx/trx0trx.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -51,6 +51,11 @@ UNIV_INTERN sess_t* trx_dummy_sess = NULL;
the kernel mutex */
UNIV_INTERN ulint trx_n_mysql_transactions = 0;
+#ifdef UNIV_PFS_MUTEX
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t trx_undo_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
/*************************************************************//**
Set detailed error message for the transaction. */
UNIV_INTERN
@@ -129,7 +134,7 @@ trx_create(
trx->mysql_log_file_name = NULL;
trx->mysql_log_offset = 0;
- mutex_create(&trx->undo_mutex, SYNC_TRX_UNDO);
+ mutex_create(trx_undo_mutex_key, &trx->undo_mutex, SYNC_TRX_UNDO);
trx->rseg = NULL;
@@ -425,6 +430,7 @@ trx_lists_init_at_db_start(void)
trx_undo_t* undo;
trx_t* trx;
+ ut_ad(mutex_own(&kernel_mutex));
UT_LIST_INIT(trx_sys->trx_list);
/* Look from the rollback segments if there exist undo logs for
diff --git a/storage/innobase/ut/ut0rbt.c b/storage/innobase/ut/ut0rbt.c
new file mode 100644
index 00000000000..3279307308f
--- /dev/null
+++ b/storage/innobase/ut/ut0rbt.c
@@ -0,0 +1,1231 @@
+/**********************************************************************
+Red-Black tree implementation
+
+(c) 2007 Oracle/Innobase Oy
+
+Created 2007-03-20 Sunny Bains
+***********************************************************************/
+
+#include "ut0rbt.h"
+
+/************************************************************************
+Definition of a red-black tree
+==============================
+
+A red-black tree is a binary search tree which has the following
+red-black properties:
+
+ 1. Every node is either red or black.
+ 2. Every leaf (NULL - in our case tree->nil) is black.
+ 3. If a node is red, then both its children are black.
+ 4. Every simple path from a node to a descendant leaf contains the
+ same number of black nodes.
+
+ from (3) above, the implication is that on any path from the root
+ to a leaf, red nodes must not be adjacent.
+
+ However, any number of black nodes may appear in a sequence.
+ */
+
+#if defined(IB_RBT_TESTING)
+#warning "Testing enabled!"
+#endif
+
+#define ROOT(t) (t->root->left)
+#define SIZEOF_NODE(t) ((sizeof(ib_rbt_node_t) + t->sizeof_value) - 1)
+
+/************************************************************************
+Print out the sub-tree recursively. */
+static
+void
+rbt_print_subtree(
+/*==============*/
+ const ib_rbt_t* tree, /*!< in: tree to traverse */
+ const ib_rbt_node_t* node, /*!< in: node to print */
+ ib_rbt_print_node print) /*!< in: print key function */
+{
+ /* FIXME: Doesn't do anything yet */
+ if (node != tree->nil) {
+ print(node);
+ rbt_print_subtree(tree, node->left, print);
+ rbt_print_subtree(tree, node->right, print);
+ }
+}
+
+/************************************************************************
+Verify that the keys are in order.
+@return TRUE of OK. FALSE if not ordered */
+static
+ibool
+rbt_check_ordering(
+/*===============*/
+ const ib_rbt_t* tree) /*!< in: tree to verfify */
+{
+ const ib_rbt_node_t* node;
+ const ib_rbt_node_t* prev = NULL;
+
+ /* Iterate over all the nodes, comparing each node with the prev */
+ for (node = rbt_first(tree); node; node = rbt_next(tree, prev)) {
+
+ if (prev && tree->compare(prev->value, node->value) >= 0) {
+ return(FALSE);
+ }
+
+ prev = node;
+ }
+
+ return(TRUE);
+}
+
+/************************************************************************
+Check that every path from the root to the leaves has the same count.
+Count is expressed in the number of black nodes.
+@return 0 on failure else black height of the subtree */
+static
+ibool
+rbt_count_black_nodes(
+/*==================*/
+ const ib_rbt_t* tree, /*!< in: tree to verify */
+ const ib_rbt_node_t* node) /*!< in: start of sub-tree */
+{
+ ulint result;
+
+ if (node != tree->nil) {
+ ulint left_height = rbt_count_black_nodes(tree, node->left);
+
+ ulint right_height = rbt_count_black_nodes(tree, node->right);
+
+ if (left_height == 0
+ || right_height == 0
+ || left_height != right_height) {
+
+ result = 0;
+ } else if (node->color == IB_RBT_RED) {
+
+ /* Case 3 */
+ if (node->left->color != IB_RBT_BLACK
+ || node->right->color != IB_RBT_BLACK) {
+
+ result = 0;
+ } else {
+ result = left_height;
+ }
+ /* Check if it's anything other than RED or BLACK. */
+ } else if (node->color != IB_RBT_BLACK) {
+
+ result = 0;
+ } else {
+
+ result = right_height + 1;
+ }
+ } else {
+ result = 1;
+ }
+
+ return(result);
+}
+
+/************************************************************************
+Turn the node's right child's left sub-tree into node's right sub-tree.
+This will also make node's right child it's parent. */
+static
+void
+rbt_rotate_left(
+/*============*/
+ const ib_rbt_node_t* nil, /*!< in: nil node of the tree */
+ ib_rbt_node_t* node) /*!< in: node to rotate */
+{
+ ib_rbt_node_t* right = node->right;
+
+ node->right = right->left;
+
+ if (right->left != nil) {
+ right->left->parent = node;
+ }
+
+ /* Right's new parent was node's parent. */
+ right->parent = node->parent;
+
+ /* Since root's parent is tree->nil and root->parent->left points
+ back to root, we can avoid the check. */
+ if (node == node->parent->left) {
+ /* Node was on the left of its parent. */
+ node->parent->left = right;
+ } else {
+ /* Node must have been on the right. */
+ node->parent->right = right;
+ }
+
+ /* Finally, put node on right's left. */
+ right->left = node;
+ node->parent = right;
+}
+
+/************************************************************************
+Turn the node's left child's right sub-tree into node's left sub-tree.
+This also make node's left child it's parent. */
+static
+void
+rbt_rotate_right(
+/*=============*/
+ const ib_rbt_node_t* nil, /*!< in: nil node of tree */
+ ib_rbt_node_t* node) /*!< in: node to rotate */
+{
+ ib_rbt_node_t* left = node->left;
+
+ node->left = left->right;
+
+ if (left->right != nil) {
+ left->right->parent = node;
+ }
+
+ /* Left's new parent was node's parent. */
+ left->parent = node->parent;
+
+ /* Since root's parent is tree->nil and root->parent->left points
+ back to root, we can avoid the check. */
+ if (node == node->parent->right) {
+ /* Node was on the left of its parent. */
+ node->parent->right = left;
+ } else {
+ /* Node must have been on the left. */
+ node->parent->left = left;
+ }
+
+ /* Finally, put node on left's right. */
+ left->right = node;
+ node->parent = left;
+}
+
+/************************************************************************
+Append a node to the tree. */
+static
+ib_rbt_node_t*
+rbt_tree_add_child(
+/*===============*/
+ const ib_rbt_t* tree,
+ ib_rbt_bound_t* parent,
+ ib_rbt_node_t* node)
+{
+ /* Cast away the const. */
+ ib_rbt_node_t* last = (ib_rbt_node_t*) parent->last;
+
+ if (last == tree->root || parent->result < 0) {
+ last->left = node;
+ } else {
+ /* FIXME: We don't handle duplicates (yet)! */
+ ut_a(parent->result != 0);
+
+ last->right = node;
+ }
+
+ node->parent = last;
+
+ return(node);
+}
+
+/************************************************************************
+Generic binary tree insert */
+static
+ib_rbt_node_t*
+rbt_tree_insert(
+/*============*/
+ ib_rbt_t* tree,
+ const void* key,
+ ib_rbt_node_t* node)
+{
+ ib_rbt_bound_t parent;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ parent.result = 0;
+ parent.last = tree->root;
+
+ /* Regular binary search. */
+ while (current != tree->nil) {
+
+ parent.last = current;
+ parent.result = tree->compare(key, current->value);
+
+ if (parent.result < 0) {
+ current = current->left;
+ } else {
+ current = current->right;
+ }
+ }
+
+ ut_a(current == tree->nil);
+
+ rbt_tree_add_child(tree, &parent, node);
+
+ return(node);
+}
+
+/************************************************************************
+Balance a tree after inserting a node. */
+static
+void
+rbt_balance_tree(
+/*=============*/
+ const ib_rbt_t* tree, /*!< in: tree to balance */
+ ib_rbt_node_t* node) /*!< in: node that was inserted */
+{
+ const ib_rbt_node_t* nil = tree->nil;
+ ib_rbt_node_t* parent = node->parent;
+
+ /* Restore the red-black property. */
+ node->color = IB_RBT_RED;
+
+ while (node != ROOT(tree) && parent->color == IB_RBT_RED) {
+ ib_rbt_node_t* grand_parent = parent->parent;
+
+ if (parent == grand_parent->left) {
+ ib_rbt_node_t* uncle = grand_parent->right;
+
+ if (uncle->color == IB_RBT_RED) {
+
+ /* Case 1 - change the colors. */
+ uncle->color = IB_RBT_BLACK;
+ parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ /* Move node up the tree. */
+ node = grand_parent;
+
+ } else {
+
+ if (node == parent->right) {
+ /* Right is a black node and node is
+ to the right, case 2 - move node
+ up and rotate. */
+ node = parent;
+ rbt_rotate_left(nil, node);
+ }
+
+ grand_parent = node->parent->parent;
+
+ /* Case 3. */
+ node->parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ rbt_rotate_right(nil, grand_parent);
+ }
+
+ } else {
+ ib_rbt_node_t* uncle = grand_parent->left;
+
+ if (uncle->color == IB_RBT_RED) {
+
+ /* Case 1 - change the colors. */
+ uncle->color = IB_RBT_BLACK;
+ parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ /* Move node up the tree. */
+ node = grand_parent;
+
+ } else {
+
+ if (node == parent->left) {
+ /* Left is a black node and node is to
+ the right, case 2 - move node up and
+ rotate. */
+ node = parent;
+ rbt_rotate_right(nil, node);
+ }
+
+ grand_parent = node->parent->parent;
+
+ /* Case 3. */
+ node->parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ rbt_rotate_left(nil, grand_parent);
+ }
+ }
+
+ parent = node->parent;
+ }
+
+ /* Color the root black. */
+ ROOT(tree)->color = IB_RBT_BLACK;
+}
+
+/************************************************************************
+Find the given node's successor.
+@return successor node or NULL if no successor */
+static
+ib_rbt_node_t*
+rbt_find_successor(
+/*===============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current) /*!< in: this is declared const
+ because it can be called via
+ rbt_next() */
+{
+ const ib_rbt_node_t* nil = tree->nil;
+ ib_rbt_node_t* next = current->right;
+
+ /* Is there a sub-tree to the right that we can follow. */
+ if (next != nil) {
+
+ /* Follow the left most links of the current right child. */
+ while (next->left != nil) {
+ next = next->left;
+ }
+
+ } else { /* We will have to go up the tree to find the successor. */
+ ib_rbt_node_t* parent = current->parent;
+
+ /* Cast away the const. */
+ next = (ib_rbt_node_t*) current;
+
+ while (parent != tree->root && next == parent->right) {
+ next = parent;
+ parent = next->parent;
+ }
+
+ next = (parent == tree->root) ? NULL : parent;
+ }
+
+ return(next);
+}
+
+/************************************************************************
+Find the given node's precedecessor.
+@return predecessor node or NULL if no predecesor */
+static
+ib_rbt_node_t*
+rbt_find_predecessor(
+/*=================*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current) /*!< in: this is declared const
+ because it can be called via
+ rbt_prev() */
+{
+ const ib_rbt_node_t* nil = tree->nil;
+ ib_rbt_node_t* prev = current->left;
+
+ /* Is there a sub-tree to the left that we can follow. */
+ if (prev != nil) {
+
+ /* Follow the right most links of the current left child. */
+ while (prev->right != nil) {
+ prev = prev->right;
+ }
+
+ } else { /* We will have to go up the tree to find the precedecessor. */
+ ib_rbt_node_t* parent = current->parent;
+
+ /* Cast away the const. */
+ prev = (ib_rbt_node_t*)current;
+
+ while (parent != tree->root && prev == parent->left) {
+ prev = parent;
+ parent = prev->parent;
+ }
+
+ prev = (parent == tree->root) ? NULL : parent;
+ }
+
+ return(prev);
+}
+
+/************************************************************************
+Replace node with child. After applying transformations eject becomes
+an orphan. */
+static
+void
+rbt_eject_node(
+/*===========*/
+ ib_rbt_node_t* eject, /*!< in: node to eject */
+ ib_rbt_node_t* node) /*!< in: node to replace with */
+{
+ /* Update the to be ejected node's parent's child pointers. */
+ if (eject->parent->left == eject) {
+ eject->parent->left = node;
+ } else if (eject->parent->right == eject) {
+ eject->parent->right = node;
+ } else {
+ ut_a(0);
+ }
+ /* eject is now an orphan but otherwise its pointers
+ and color are left intact. */
+
+ node->parent = eject->parent;
+}
+
+/************************************************************************
+Replace a node with another node. */
+static
+void
+rbt_replace_node(
+/*=============*/
+ ib_rbt_node_t* replace, /*!< in: node to replace */
+ ib_rbt_node_t* node) /*!< in: node to replace with */
+{
+ ib_rbt_color_t color = node->color;
+
+ /* Update the node pointers. */
+ node->left = replace->left;
+ node->right = replace->right;
+
+ /* Update the child node pointers. */
+ node->left->parent = node;
+ node->right->parent = node;
+
+ /* Make the parent of replace point to node. */
+ rbt_eject_node(replace, node);
+
+ /* Swap the colors. */
+ node->color = replace->color;
+ replace->color = color;
+}
+
+/************************************************************************
+Detach node from the tree replacing it with one of it's children.
+@return the child node that now occupies the position of the detached node */
+static
+ib_rbt_node_t*
+rbt_detach_node(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_node_t* node) /*!< in: node to detach */
+{
+ ib_rbt_node_t* child;
+ const ib_rbt_node_t* nil = tree->nil;
+
+ if (node->left != nil && node->right != nil) {
+ /* Case where the node to be deleted has two children. */
+ ib_rbt_node_t* successor = rbt_find_successor(tree, node);
+
+ ut_a(successor != nil);
+ ut_a(successor->parent != nil);
+ ut_a(successor->left == nil);
+
+ child = successor->right;
+
+ /* Remove the successor node and replace with its child. */
+ rbt_eject_node(successor, child);
+
+ /* Replace the node to delete with its successor node. */
+ rbt_replace_node(node, successor);
+ } else {
+ ut_a(node->left == nil || node->right == nil);
+
+ child = (node->left != nil) ? node->left : node->right;
+
+ /* Replace the node to delete with one of it's children. */
+ rbt_eject_node(node, child);
+ }
+
+ /* Reset the node links. */
+ node->parent = node->right = node->left = tree->nil;
+
+ return(child);
+}
+
+/************************************************************************
+Rebalance the right sub-tree after deletion.
+@return node to rebalance if more rebalancing required else NULL */
+static
+ib_rbt_node_t*
+rbt_balance_right(
+/*==============*/
+ const ib_rbt_node_t* nil, /*!< in: rb tree nil node */
+ ib_rbt_node_t* parent, /*!< in: parent node */
+ ib_rbt_node_t* sibling) /*!< in: sibling node */
+{
+ ib_rbt_node_t* node = NULL;
+
+ ut_a(sibling != nil);
+
+ /* Case 3. */
+ if (sibling->color == IB_RBT_RED) {
+
+ parent->color = IB_RBT_RED;
+ sibling->color = IB_RBT_BLACK;
+
+ rbt_rotate_left(nil, parent);
+
+ sibling = parent->right;
+
+ ut_a(sibling != nil);
+ }
+
+ /* Since this will violate case 3 because of the change above. */
+ if (sibling->left->color == IB_RBT_BLACK
+ && sibling->right->color == IB_RBT_BLACK) {
+
+ node = parent; /* Parent needs to be rebalanced too. */
+ sibling->color = IB_RBT_RED;
+
+ } else {
+ if (sibling->right->color == IB_RBT_BLACK) {
+
+ ut_a(sibling->left->color == IB_RBT_RED);
+
+ sibling->color = IB_RBT_RED;
+ sibling->left->color = IB_RBT_BLACK;
+
+ rbt_rotate_right(nil, sibling);
+
+ sibling = parent->right;
+ ut_a(sibling != nil);
+ }
+
+ sibling->color = parent->color;
+ sibling->right->color = IB_RBT_BLACK;
+
+ parent->color = IB_RBT_BLACK;
+
+ rbt_rotate_left(nil, parent);
+ }
+
+ return(node);
+}
+
+/************************************************************************
+Rebalance the left sub-tree after deletion.
+@return node to rebalance if more rebalancing required else NULL */
+static
+ib_rbt_node_t*
+rbt_balance_left(
+/*=============*/
+ const ib_rbt_node_t* nil, /*!< in: rb tree nil node */
+ ib_rbt_node_t* parent, /*!< in: parent node */
+ ib_rbt_node_t* sibling) /*!< in: sibling node */
+{
+ ib_rbt_node_t* node = NULL;
+
+ ut_a(sibling != nil);
+
+ /* Case 3. */
+ if (sibling->color == IB_RBT_RED) {
+
+ parent->color = IB_RBT_RED;
+ sibling->color = IB_RBT_BLACK;
+
+ rbt_rotate_right(nil, parent);
+ sibling = parent->left;
+
+ ut_a(sibling != nil);
+ }
+
+ /* Since this will violate case 3 because of the change above. */
+ if (sibling->right->color == IB_RBT_BLACK
+ && sibling->left->color == IB_RBT_BLACK) {
+
+ node = parent; /* Parent needs to be rebalanced too. */
+ sibling->color = IB_RBT_RED;
+
+ } else {
+ if (sibling->left->color == IB_RBT_BLACK) {
+
+ ut_a(sibling->right->color == IB_RBT_RED);
+
+ sibling->color = IB_RBT_RED;
+ sibling->right->color = IB_RBT_BLACK;
+
+ rbt_rotate_left(nil, sibling);
+
+ sibling = parent->left;
+
+ ut_a(sibling != nil);
+ }
+
+ sibling->color = parent->color;
+ sibling->left->color = IB_RBT_BLACK;
+
+ parent->color = IB_RBT_BLACK;
+
+ rbt_rotate_right(nil, parent);
+ }
+
+ return(node);
+}
+
+/************************************************************************
+Delete the node and rebalance the tree if necessary */
+static
+void
+rbt_remove_node_and_rebalance(
+/*==========================*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_node_t* node) /*!< in: node to remove */
+{
+ /* Detach node and get the node that will be used
+ as rebalance start. */
+ ib_rbt_node_t* child = rbt_detach_node(tree, node);
+
+ if (node->color == IB_RBT_BLACK) {
+ ib_rbt_node_t* last = child;
+
+ ROOT(tree)->color = IB_RBT_RED;
+
+ while (child && child->color == IB_RBT_BLACK) {
+ ib_rbt_node_t* parent = child->parent;
+
+ /* Did the deletion cause an imbalance in the
+ parents left sub-tree. */
+ if (parent->left == child) {
+
+ child = rbt_balance_right(
+ tree->nil, parent, parent->right);
+
+ } else if (parent->right == child) {
+
+ child = rbt_balance_left(
+ tree->nil, parent, parent->left);
+
+ } else {
+ ut_error;
+ }
+
+ if (child) {
+ last = child;
+ }
+ }
+
+ ut_a(last);
+
+ last->color = IB_RBT_BLACK;
+ ROOT(tree)->color = IB_RBT_BLACK;
+ }
+
+ /* Note that we have removed a node from the tree. */
+ --tree->n_nodes;
+}
+
+/************************************************************************
+Recursively free the nodes. */
+static
+void
+rbt_free_node(
+/*==========*/
+ ib_rbt_node_t* node, /*!< in: node to free */
+ ib_rbt_node_t* nil) /*!< in: rb tree nil node */
+{
+ if (node != nil) {
+ rbt_free_node(node->left, nil);
+ rbt_free_node(node->right, nil);
+
+ ut_free(node);
+ }
+}
+
+/************************************************************************
+Free all the nodes and free the tree. */
+UNIV_INTERN
+void
+rbt_free(
+/*=====*/
+ ib_rbt_t* tree) /*!< in: rb tree to free */
+{
+ rbt_free_node(tree->root, tree->nil);
+ ut_free(tree->nil);
+ ut_free(tree);
+}
+
+/************************************************************************
+Create an instance of a red black tree.
+@return an empty rb tree */
+UNIV_INTERN
+ib_rbt_t*
+rbt_create(
+/*=======*/
+ size_t sizeof_value, /*!< in: sizeof data item */
+ ib_rbt_compare compare) /*!< in: fn to compare items */
+{
+ ib_rbt_t* tree;
+ ib_rbt_node_t* node;
+
+ tree = (ib_rbt_t*) ut_malloc(sizeof(*tree));
+ memset(tree, 0, sizeof(*tree));
+
+ tree->sizeof_value = sizeof_value;
+
+ /* Create the sentinel (NIL) node. */
+ node = tree->nil = (ib_rbt_node_t*) ut_malloc(sizeof(*node));
+ memset(node, 0, sizeof(*node));
+
+ node->color = IB_RBT_BLACK;
+ node->parent = node->left = node->right = node;
+
+ /* Create the "fake" root, the real root node will be the
+ left child of this node. */
+ node = tree->root = (ib_rbt_node_t*) ut_malloc(sizeof(*node));
+ memset(node, 0, sizeof(*node));
+
+ node->color = IB_RBT_BLACK;
+ node->parent = node->left = node->right = tree->nil;
+
+ tree->compare = compare;
+
+ return(tree);
+}
+
+/************************************************************************
+Generic insert of a value in the rb tree.
+@return inserted node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_insert(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key, /*!< in: key for ordering */
+ const void* value) /*!< in: value of key, this value
+ is copied to the node */
+{
+ ib_rbt_node_t* node;
+
+ /* Create the node that will hold the value data. */
+ node = (ib_rbt_node_t*) ut_malloc(SIZEOF_NODE(tree));
+
+ memcpy(node->value, value, tree->sizeof_value);
+ node->parent = node->left = node->right = tree->nil;
+
+ /* Insert in the tree in the usual way. */
+ rbt_tree_insert(tree, key, node);
+ rbt_balance_tree(tree, node);
+
+ ++tree->n_nodes;
+
+ return(node);
+}
+
+/************************************************************************
+Add a new node to the tree, useful for data that is pre-sorted.
+@return appended node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_add_node(
+/*=========*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: bounds */
+ const void* value) /*!< in: this value is copied
+ to the node */
+{
+ ib_rbt_node_t* node;
+
+ /* Create the node that will hold the value data */
+ node = (ib_rbt_node_t*) ut_malloc(SIZEOF_NODE(tree));
+
+ memcpy(node->value, value, tree->sizeof_value);
+ node->parent = node->left = node->right = tree->nil;
+
+ /* If tree is empty */
+ if (parent->last == NULL) {
+ parent->last = tree->root;
+ }
+
+ /* Append the node, the hope here is that the caller knows
+ what s/he is doing. */
+ rbt_tree_add_child(tree, parent, node);
+ rbt_balance_tree(tree, node);
+
+ ++tree->n_nodes;
+
+#if defined(IB_RBT_TESTING)
+ ut_a(rbt_validate(tree));
+#endif
+ return(node);
+}
+
+/************************************************************************
+Find a matching node in the rb tree.
+@return NULL if not found else the node where key was found */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lookup(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to use for search */
+{
+ const ib_rbt_node_t* current = ROOT(tree);
+
+ /* Regular binary search. */
+ while (current != tree->nil) {
+ int result = tree->compare(key, current->value);
+
+ if (result < 0) {
+ current = current->left;
+ } else if (result > 0) {
+ current = current->right;
+ } else {
+ break;
+ }
+ }
+
+ return(current != tree->nil ? current : NULL);
+}
+
+/************************************************************************
+Delete a node indentified by key.
+@return TRUE if success FALSE if not found */
+UNIV_INTERN
+ibool
+rbt_delete(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to delete */
+{
+ ibool deleted = FALSE;
+ ib_rbt_node_t* node = (ib_rbt_node_t*) rbt_lookup(tree, key);
+
+ if (node) {
+ rbt_remove_node_and_rebalance(tree, node);
+
+ ut_free(node);
+ deleted = TRUE;
+ }
+
+ return(deleted);
+}
+
+/************************************************************************
+Remove a node from the rb tree, the node is not free'd, that is the
+callers responsibility.
+@return deleted node but without the const */
+UNIV_INTERN
+ib_rbt_node_t*
+rbt_remove_node(
+/*============*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* const_node) /*!< in: node to delete, this
+ is a fudge and declared const
+ because the caller can access
+ only const nodes */
+{
+ /* Cast away the const. */
+ rbt_remove_node_and_rebalance(tree, (ib_rbt_node_t*) const_node);
+
+ /* This is to make it easier to do something like this:
+ ut_free(rbt_remove_node(node));
+ */
+
+ return((ib_rbt_node_t*) const_node);
+}
+
+/************************************************************************
+Find the node that has the lowest key that is >= key.
+@return node satisfying the lower bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lower_bound(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to search */
+{
+ ib_rbt_node_t* lb_node = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ int result = tree->compare(key, current->value);
+
+ if (result > 0) {
+
+ current = current->right;
+
+ } else if (result < 0) {
+
+ lb_node = current;
+ current = current->left;
+
+ } else {
+ lb_node = current;
+ break;
+ }
+ }
+
+ return(lb_node);
+}
+
+/************************************************************************
+Find the node that has the greatest key that is <= key.
+@return node satisfying the upper bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_upper_bound(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to search */
+{
+ ib_rbt_node_t* ub_node = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ int result = tree->compare(key, current->value);
+
+ if (result > 0) {
+
+ ub_node = current;
+ current = current->right;
+
+ } else if (result < 0) {
+
+ current = current->left;
+
+ } else {
+ ub_node = current;
+ break;
+ }
+ }
+
+ return(ub_node);
+}
+
+/************************************************************************
+Find the node that has the greatest key that is <= key.
+@return value of result */
+UNIV_INTERN
+int
+rbt_search(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key) /*!< in: key to search */
+{
+ ib_rbt_node_t* current = ROOT(tree);
+
+ /* Every thing is greater than the NULL root. */
+ parent->result = 1;
+ parent->last = NULL;
+
+ while (current != tree->nil) {
+
+ parent->last = current;
+ parent->result = tree->compare(key, current->value);
+
+ if (parent->result > 0) {
+ current = current->right;
+ } else if (parent->result < 0) {
+ current = current->left;
+ } else {
+ break;
+ }
+ }
+
+ return(parent->result);
+}
+
+/************************************************************************
+Find the node that has the greatest key that is <= key. But use the
+supplied comparison function.
+@return value of result */
+UNIV_INTERN
+int
+rbt_search_cmp(
+/*===========*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key, /*!< in: key to search */
+ ib_rbt_compare compare) /*!< in: fn to compare items */
+{
+ ib_rbt_node_t* current = ROOT(tree);
+
+ /* Every thing is greater than the NULL root. */
+ parent->result = 1;
+ parent->last = NULL;
+
+ while (current != tree->nil) {
+
+ parent->last = current;
+ parent->result = compare(key, current->value);
+
+ if (parent->result > 0) {
+ current = current->right;
+ } else if (parent->result < 0) {
+ current = current->left;
+ } else {
+ break;
+ }
+ }
+
+ return(parent->result);
+}
+
+/************************************************************************
+Return the left most node in the tree. */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_first(
+/*======*/
+ /* out leftmost node or NULL */
+ const ib_rbt_t* tree) /* in: rb tree */
+{
+ ib_rbt_node_t* first = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ first = current;
+ current = current->left;
+ }
+
+ return(first);
+}
+
+/************************************************************************
+Return the right most node in the tree.
+@return the rightmost node or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_last(
+/*=====*/
+ const ib_rbt_t* tree) /*!< in: rb tree */
+{
+ ib_rbt_node_t* last = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ last = current;
+ current = current->right;
+ }
+
+ return(last);
+}
+
+/************************************************************************
+Return the next node.
+@return node next from current */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_next(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current) /*!< in: current node */
+{
+ return(current ? rbt_find_successor(tree, current) : NULL);
+}
+
+/************************************************************************
+Return the previous node.
+@return node prev from current */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_prev(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current) /*!< in: current node */
+{
+ return(current ? rbt_find_predecessor(tree, current) : NULL);
+}
+
+/************************************************************************
+Reset the tree. Delete all the nodes. */
+UNIV_INTERN
+void
+rbt_clear(
+/*======*/
+ ib_rbt_t* tree) /*!< in: rb tree */
+{
+ rbt_free_node(ROOT(tree), tree->nil);
+
+ tree->n_nodes = 0;
+ tree->root->left = tree->root->right = tree->nil;
+}
+
+/************************************************************************
+Merge the node from dst into src. Return the number of nodes merged.
+@return no. of recs merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq(
+/*===========*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ const ib_rbt_t* src) /*!< in: src rb tree */
+{
+ ib_rbt_bound_t parent;
+ ulint n_merged = 0;
+ const ib_rbt_node_t* src_node = rbt_first(src);
+
+ if (rbt_empty(src) || dst == src) {
+ return(0);
+ }
+
+ for (/* No op */; src_node; src_node = rbt_next(src, src_node)) {
+
+ if (rbt_search(dst, &parent, src_node->value) != 0) {
+ rbt_add_node(dst, &parent, src_node->value);
+ ++n_merged;
+ }
+ }
+
+ return(n_merged);
+}
+
+/************************************************************************
+Merge the node from dst into src. Return the number of nodes merged.
+Delete the nodes from src after copying node to dst. As a side effect
+the duplicates will be left untouched in the src.
+@return no. of recs merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq_destructive(
+/*=======================*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ ib_rbt_t* src) /*!< in: src rb tree */
+{
+ ib_rbt_bound_t parent;
+ ib_rbt_node_t* src_node;
+ ulint old_size = rbt_size(dst);
+
+ if (rbt_empty(src) || dst == src) {
+ return(0);
+ }
+
+ for (src_node = (ib_rbt_node_t*) rbt_first(src); src_node; /* */) {
+ ib_rbt_node_t* prev = src_node;
+
+ src_node = (ib_rbt_node_t*)rbt_next(src, prev);
+
+ /* Skip duplicates. */
+ if (rbt_search(dst, &parent, prev->value) != 0) {
+
+ /* Remove and reset the node but preserve
+ the node (data) value. */
+ rbt_remove_node_and_rebalance(src, prev);
+
+ /* The nil should be taken from the dst tree. */
+ prev->parent = prev->left = prev->right = dst->nil;
+ rbt_tree_add_child(dst, &parent, prev);
+ rbt_balance_tree(dst, prev);
+
+ ++dst->n_nodes;
+ }
+ }
+
+#if defined(IB_RBT_TESTING)
+ ut_a(rbt_validate(dst));
+ ut_a(rbt_validate(src));
+#endif
+ return(rbt_size(dst) - old_size);
+}
+
+/************************************************************************
+Check that every path from the root to the leaves has the same count and
+the tree nodes are in order.
+@return TRUE if OK FALSE otherwise */
+UNIV_INTERN
+ibool
+rbt_validate(
+/*=========*/
+ const ib_rbt_t* tree) /*!< in: RB tree to validate */
+{
+ if (rbt_count_black_nodes(tree, ROOT(tree)) > 0) {
+ return(rbt_check_ordering(tree));
+ }
+
+ return(FALSE);
+}
+
+/************************************************************************
+Iterate over the tree in depth first order. */
+UNIV_INTERN
+void
+rbt_print(
+/*======*/
+ const ib_rbt_t* tree, /*!< in: tree to traverse */
+ ib_rbt_print_node print) /*!< in: print function */
+{
+ rbt_print_subtree(tree, ROOT(tree), print);
+}
diff --git a/storage/innobase/ut/ut0ut.c b/storage/innobase/ut/ut0ut.c
index 498873e290a..6b65067aa54 100644
--- a/storage/innobase/ut/ut0ut.c
+++ b/storage/innobase/ut/ut0ut.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2009, Sun Microsystems, Inc.
Portions of this file contain modifications contributed and copyrighted by
@@ -623,3 +623,105 @@ ut_snprintf(
return(res);
}
#endif /* __WIN__ */
+
+/*************************************************************//**
+Convert an error number to a human readable text message. The
+returned string is static and should not be freed or modified.
+@return string, describing the error */
+UNIV_INTERN
+const char*
+ut_strerr(
+/*======*/
+ enum db_err num) /*!< in: error number */
+{
+ switch (num) {
+ case DB_SUCCESS:
+ return("Success");
+ case DB_ERROR:
+ return("Generic error");
+ case DB_INTERRUPTED:
+ return("Operation interrupted");
+ case DB_OUT_OF_MEMORY:
+ return("Cannot allocate memory");
+ case DB_OUT_OF_FILE_SPACE:
+ return("Out of disk space");
+ case DB_LOCK_WAIT:
+ return("Lock wait");
+ case DB_DEADLOCK:
+ return("Deadlock");
+ case DB_ROLLBACK:
+ return("Rollback");
+ case DB_DUPLICATE_KEY:
+ return("Duplicate key");
+ case DB_QUE_THR_SUSPENDED:
+ return("The queue thread has been suspended");
+ case DB_MISSING_HISTORY:
+ return("Required history data has been deleted");
+ case DB_CLUSTER_NOT_FOUND:
+ return("Cluster not found");
+ case DB_TABLE_NOT_FOUND:
+ return("Table not found");
+ case DB_MUST_GET_MORE_FILE_SPACE:
+ return("More file space needed");
+ case DB_TABLE_IS_BEING_USED:
+ return("Table is being used");
+ case DB_TOO_BIG_RECORD:
+ return("Record too big");
+ case DB_LOCK_WAIT_TIMEOUT:
+ return("Lock wait timeout");
+ case DB_NO_REFERENCED_ROW:
+ return("Referenced key value not found");
+ case DB_ROW_IS_REFERENCED:
+ return("Row is referenced");
+ case DB_CANNOT_ADD_CONSTRAINT:
+ return("Cannot add constraint");
+ case DB_CORRUPTION:
+ return("Data structure corruption");
+ case DB_COL_APPEARS_TWICE_IN_INDEX:
+ return("Column appears twice in index");
+ case DB_CANNOT_DROP_CONSTRAINT:
+ return("Cannot drop constraint");
+ case DB_NO_SAVEPOINT:
+ return("No such savepoint");
+ case DB_TABLESPACE_ALREADY_EXISTS:
+ return("Tablespace already exists");
+ case DB_TABLESPACE_DELETED:
+ return("No such tablespace");
+ case DB_LOCK_TABLE_FULL:
+ return("Lock structs have exhausted the buffer pool");
+ case DB_FOREIGN_DUPLICATE_KEY:
+ return("Foreign key activated with duplicate keys");
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ return("Too many concurrent transactions");
+ case DB_UNSUPPORTED:
+ return("Unsupported");
+ case DB_PRIMARY_KEY_IS_NULL:
+ return("Primary key is NULL");
+ case DB_STATS_DO_NOT_EXIST:
+ return("Persistent statistics do not exist");
+ case DB_FAIL:
+ return("Failed, retry may succeed");
+ case DB_OVERFLOW:
+ return("Overflow");
+ case DB_UNDERFLOW:
+ return("Underflow");
+ case DB_STRONG_FAIL:
+ return("Failed, retry will not succeed");
+ case DB_ZIP_OVERFLOW:
+ return("Zip overflow");
+ case DB_RECORD_NOT_FOUND:
+ return("Record not found");
+ case DB_END_OF_INDEX:
+ return("End of index");
+ /* do not add default: in order to produce a warning if new code
+ is added to the enum but not added here */
+ }
+
+ /* we abort here because if unknown error code is given, this could
+ mean that memory corruption has happened and someone's error-code
+ variable has been overwritten with bogus data */
+ ut_error;
+
+ /* NOT REACHED */
+ return("Unknown error");
+}
diff --git a/storage/innobase/ut/ut0wqueue.c b/storage/innobase/ut/ut0wqueue.c
index 5220d1e17f4..d32086bdfc4 100644
--- a/storage/innobase/ut/ut0wqueue.c
+++ b/storage/innobase/ut/ut0wqueue.c
@@ -35,7 +35,9 @@ ib_wqueue_create(void)
{
ib_wqueue_t* wq = mem_alloc(sizeof(ib_wqueue_t));
- mutex_create(&wq->mutex, SYNC_WORK_QUEUE);
+ /* Function ib_wqueue_create() has not been used anywhere,
+ not necessary to instrument this mutex */
+ mutex_create(PFS_NOT_INSTRUMENTED, &wq->mutex, SYNC_WORK_QUEUE);
wq->items = ib_list_create();
wq->event = os_event_create(NULL);